1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
//! SIMD Capabilities Detection and Utilities
//!
//! This module provides platform capabilities detection and optimization utilities
//! for making runtime decisions about which SIMD algorithms to use.
/// Snapshot of platform capabilities used to steer optimization decisions.
///
/// Values of this type are produced by `Capabilities::detect_capabilities`.
/// The type is plain data: it is cheap to copy and can be compared for
/// equality, which is convenient in tests and when caching a snapshot.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SimdCapabilities {
    /// Whether the compiler is assumed to be able to auto-vectorize loops.
    pub has_auto_vectorization: bool,
    /// Whether target-specific SIMD features were reported as available.
    pub has_target_features: bool,
    /// Suggested loop unroll factor.
    pub recommended_unroll_factor: usize,
    /// Cache line size, in bytes.
    pub cache_line_size: usize,
}
/// Stateless entry point for SIMD capability detection and tuning heuristics.
///
/// Every operation is an associated function, so no instance is ever needed.
pub struct Capabilities;

impl Capabilities {
    /// Builds a [`SimdCapabilities`] snapshot for the current platform.
    ///
    /// Only `has_target_features` is actually probed. Auto-vectorization is
    /// assumed to be available, the unroll factor is fixed at 8, and the cache
    /// line size comes from [`Self::estimate_cache_line_size`].
    pub fn detect_capabilities() -> SimdCapabilities {
        SimdCapabilities {
            has_auto_vectorization: true, // Compiler can auto-vectorize
            has_target_features: Self::has_target_feature_support(),
            recommended_unroll_factor: 8,
            cache_line_size: Self::estimate_cache_line_size(),
        }
    }

    /// Reports whether target-specific SIMD features are available.
    ///
    /// On x86/x86_64 this is the result of the SSE2 probe; every other
    /// architecture is assumed to have basic vectorization support.
    fn has_target_feature_support() -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            Self::detect_x86_features()
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            // For other architectures, assume basic vectorization support
            true
        }
    }

    /// Detects the x86 SIMD baseline (SSE2).
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn detect_x86_features() -> bool {
        // `is_x86_feature_detected!` already resolves at compile time when the
        // feature is statically enabled (as SSE2 is on x86_64), so there is no
        // need for a separate `cfg(target_feature)` branch here.
        Self::has_sse2_support()
    }

    /// Returns the assumed cache line size in bytes.
    ///
    /// This is a fixed estimate (64), not a value queried from the hardware.
    pub fn estimate_cache_line_size() -> usize {
        // Most modern processors use 64-byte cache lines
        64
    }

    /// Picks a block size, in elements, for cache-friendly processing.
    ///
    /// The size targets a quarter of an assumed 32 KiB L1 cache, is rounded
    /// down to a whole number of cache lines, capped at `data_size`, and
    /// finally raised to at least four cache lines' worth of elements. Because
    /// that lower bound is applied last, the result can exceed `data_size` for
    /// small inputs.
    ///
    /// `data_size` is the number of elements available and `element_size` is
    /// the size of one element in bytes. Degenerate element sizes are
    /// tolerated instead of panicking with a division by zero: a zero-sized
    /// element is treated as one byte wide, and an element wider than a cache
    /// line counts as one element per line.
    pub fn optimal_block_size(data_size: usize, element_size: usize) -> usize {
        /// Typical L1 data cache size (32 KiB); an assumption, not a measurement.
        const L1_CACHE_BYTES: usize = 32 * 1024;
        /// Only this fraction (1/4) of L1 is budgeted for a single block.
        const L1_DIVISOR: usize = 4;
        /// Smallest block that is ever returned, expressed in cache lines.
        const MIN_BLOCK_CACHE_LINES: usize = 4;

        // Guard the divisions below against zero-sized element types.
        let element_size = element_size.max(1);

        let max_block_elements = L1_CACHE_BYTES / element_size / L1_DIVISOR;
        // An element wider than a cache line would yield 0 here and make the
        // rounding step divide by zero, so clamp to one element per line.
        let cache_line_elements = (Self::estimate_cache_line_size() / element_size).max(1);

        // Round down to a cache line boundary.
        let rounded = (max_block_elements / cache_line_elements) * cache_line_elements;

        rounded
            .min(data_size)
            .max(cache_line_elements * MIN_BLOCK_CACHE_LINES)
    }

    /// Chooses a loop unroll factor from the number of elements to process.
    ///
    /// Larger inputs get more aggressive unrolling, from 1 (no unrolling) for
    /// at most 32 elements up to 16 for more than 2048 elements.
    pub fn optimal_unroll_factor(data_size: usize) -> usize {
        match data_size {
            0..=32 => 1,     // Very small: no unrolling
            33..=128 => 2,   // Small: minimal unrolling
            129..=512 => 4,  // Medium: moderate unrolling
            513..=2048 => 8, // Large: standard unrolling
            _ => 16,         // Very large: aggressive unrolling
        }
    }

    /// Returns `true` when the input is large enough (at least 8 elements)
    /// for SIMD code paths to be considered worthwhile.
    pub fn should_use_simd(data_size: usize) -> bool {
        data_size >= 8
    }

    /// Returns the preferred memory alignment, in bytes, for SIMD buffers.
    ///
    /// This is a compile-time choice per architecture: 32 on x86/x86_64 and
    /// 16 everywhere else. No runtime feature detection is involved.
    pub fn memory_alignment_requirement() -> usize {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            32 // AVX-friendly alignment
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
        {
            16 // Default to 16-byte alignment
        }
    }

    /// Checks whether `ptr` is aligned to [`Self::memory_alignment_requirement`].
    ///
    /// Only the address is inspected; the pointer is never dereferenced.
    pub fn is_aligned(ptr: *const f32) -> bool {
        let alignment = Self::memory_alignment_requirement();
        (ptr as usize) % alignment == 0
    }

    /// Returns how many `f32` values one SIMD operation is expected to process.
    ///
    /// On x86/x86_64 the answer comes from runtime detection (8 with AVX2,
    /// 4 with SSE2, otherwise 1). On aarch64 it is 4 (NEON), and on any other
    /// architecture it is 1 (scalar fallback).
    pub fn simd_lanes_f32() -> usize {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if Self::has_avx2_support() {
                8 // AVX2 can process 8 f32 values at once
            } else if Self::has_sse2_support() {
                4 // SSE2 can process 4 f32 values at once
            } else {
                1 // Fallback to scalar
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            4 // NEON can process 4 f32 values at once
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
        {
            1 // Fallback to scalar for other architectures
        }
    }

    /// Checks for AVX2 support (x86/x86_64 only).
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn has_avx2_support() -> bool {
        std::arch::is_x86_feature_detected!("avx2")
    }

    /// Checks for SSE2 support (x86/x86_64 only).
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn has_sse2_support() -> bool {
        std::arch::is_x86_feature_detected!("sse2")
    }

    /// Collects the platform-specific performance hints into one value.
    pub fn get_performance_hints() -> PerformanceHints {
        PerformanceHints {
            prefer_unrolling: Self::should_prefer_unrolling(),
            prefer_blocking: Self::should_prefer_blocking(),
            optimal_chunk_size: Self::get_optimal_chunk_size(),
            memory_prefetch_distance: Self::get_prefetch_distance(),
        }
    }

    /// Reports whether loop unrolling is recommended on this architecture.
    ///
    /// `true` on x86, x86_64 and aarch64; conservatively `false` elsewhere.
    fn should_prefer_unrolling() -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
        {
            true
        }
        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
        {
            false // Conservative default for unknown architectures
        }
    }

    /// Reports whether cache blocking is recommended; currently always `true`.
    fn should_prefer_blocking() -> bool {
        // Blocking is beneficial for most architectures with hierarchical caches
        true
    }

    /// Returns the preferred processing chunk size, in `f32` elements.
    ///
    /// The chunk is the larger of the SIMD lane count and the number of `f32`
    /// values per cache line, rounded up to a power of two and capped at 64.
    fn get_optimal_chunk_size() -> usize {
        let simd_lanes = Self::simd_lanes_f32();
        let cache_line_elements = Self::estimate_cache_line_size() / std::mem::size_of::<f32>();

        let base_chunk = simd_lanes.max(cache_line_elements);

        base_chunk.next_power_of_two().min(64) // Cap at reasonable size
    }

    /// Returns the prefetch distance in bytes: four cache lines ahead.
    fn get_prefetch_distance() -> usize {
        Self::estimate_cache_line_size() * 4
    }
}
/// Performance optimization hints for the current platform.
///
/// Values are normally obtained from `Capabilities::get_performance_hints`
/// (which is also what [`Default`] returns). The type is plain data and can be
/// compared for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PerformanceHints {
    /// Whether loop unrolling is recommended on this platform.
    pub prefer_unrolling: bool,
    /// Whether cache blocking is recommended on this platform.
    pub prefer_blocking: bool,
    /// Preferred processing chunk size, in `f32` elements.
    pub optimal_chunk_size: usize,
    /// How far ahead to prefetch, in bytes.
    pub memory_prefetch_distance: usize,
}

impl Default for PerformanceHints {
    /// Returns the hints computed for the current platform by
    /// `Capabilities::get_performance_hints`.
    fn default() -> Self {
        Capabilities::get_performance_hints()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The detected snapshot must carry non-degenerate values.
    #[test]
    fn test_detect_capabilities() {
        let SimdCapabilities {
            recommended_unroll_factor,
            cache_line_size,
            ..
        } = Capabilities::detect_capabilities();

        assert!(recommended_unroll_factor > 0);
        assert!(cache_line_size > 0);
        assert!(cache_line_size.is_power_of_two());
    }

    /// For 1024 `f32` values the block must be positive, fit the data, and
    /// cover a whole number of cache lines.
    #[test]
    fn test_optimal_block_size() {
        const DATA_LEN: usize = 1024;
        let elem_bytes = std::mem::size_of::<f32>();

        let block = Capabilities::optimal_block_size(DATA_LEN, elem_bytes);
        assert!(block > 0);
        assert!(block <= DATA_LEN);

        let elems_per_line = Capabilities::estimate_cache_line_size() / elem_bytes;
        assert_eq!(block % elems_per_line, 0);
    }

    /// One representative size per unroll tier.
    #[test]
    fn test_optimal_unroll_factor() {
        let tiers: [(usize, usize); 5] = [(16, 1), (64, 2), (256, 4), (1024, 8), (4096, 16)];
        for &(len, expected_factor) in &tiers {
            assert_eq!(Capabilities::optimal_unroll_factor(len), expected_factor);
        }
    }

    /// SIMD is rejected below 8 elements and accepted from 8 upwards.
    #[test]
    fn test_should_use_simd() {
        for &too_small in &[4usize, 7] {
            assert!(!Capabilities::should_use_simd(too_small));
        }
        for &big_enough in &[8usize, 32, 1024] {
            assert!(Capabilities::should_use_simd(big_enough));
        }
    }

    /// The alignment must be a power of two of at least 8 bytes.
    #[test]
    fn test_memory_alignment_requirement() {
        let required = Capabilities::memory_alignment_requirement();
        assert!(required.is_power_of_two());
        assert!(required >= 8);
    }

    /// The lane count must be a positive power of two.
    #[test]
    fn test_simd_lanes_f32() {
        let lane_count = Capabilities::simd_lanes_f32();
        assert!(lane_count >= 1);
        assert!(lane_count.is_power_of_two());
    }

    /// The hints must carry a power-of-two chunk size and a non-zero prefetch
    /// distance.
    #[test]
    fn test_performance_hints() {
        let PerformanceHints {
            optimal_chunk_size,
            memory_prefetch_distance,
            ..
        } = Capabilities::get_performance_hints();

        assert!(optimal_chunk_size > 0);
        assert!(optimal_chunk_size.is_power_of_two());
        assert!(memory_prefetch_distance > 0);
    }

    /// The estimated cache line must be a power of two within 32..=256 bytes.
    #[test]
    fn test_cache_line_size() {
        let line_bytes = Capabilities::estimate_cache_line_size();
        assert!(line_bytes.is_power_of_two());
        assert!(line_bytes >= 32);
        assert!(line_bytes <= 256);
    }
}