1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
//! Performance comparison tests between SIMD and scalar implementations
//!
//! Benchmarks SIMD operations against scalar implementations to validate performance.
#[cfg(feature = "std")]
mod performance_tests {
use crate::simd::dispatch::*;
use crate::simd::scalar::*;
use std::time::{Duration, Instant};
use std::vec::Vec;
/// Benchmark harness for consistent performance measurement.
///
/// Calls `operation_fn` `iterations` times, each time on a fresh input produced by
/// `setup_fn`, and returns the mean wall-clock time of one call. Only the call to
/// `operation_fn` is inside the timed region; `setup_fn` runs before the clock starts.
///
/// Returns a zero `Duration` without running anything when `iterations` is 0.
fn benchmark_operation<F>(
    iterations: usize,
    setup_fn: F,
    operation_fn: &mut dyn FnMut(&mut [u64; 16]),
) -> Duration
where
    F: Fn() -> [u64; 16],
{
    // Dividing a `Duration` by zero panics, so an empty run reports zero time instead.
    if iterations == 0 {
        return Duration::new(0, 0);
    }

    let mut total_time = Duration::new(0, 0);
    for _ in 0..iterations {
        let mut data = setup_fn();
        let start = Instant::now();
        operation_fn(&mut data);
        // Keep the result observable so the optimizer cannot discard the timed work.
        std::hint::black_box(&data);
        total_time += start.elapsed();
    }

    // Average in nanoseconds: an `as u32` cast of the count would silently truncate
    // iteration counts above `u32::MAX` and inflate the reported mean.
    let mean_nanos = total_time.as_nanos() / iterations as u128;
    Duration::from_nanos(mean_nanos.min(u128::from(u64::MAX)) as u64)
}
/// Times `scalar_clock_mix` and `clock_mix_avx2` on the same fixed input and checks
/// that the SIMD mean stays within a build-dependent factor of the scalar mean.
#[test]
fn test_simd_performance_basic() {
    const RUNS: usize = 1000;

    /// Identical input words for every measured call.
    fn fresh_input() -> [u64; 16] {
        [0x123456789ABCDEF0u64; 16]
    }

    // Mean time per call: scalar path first, then the SIMD path.
    let scalar_time = benchmark_operation(RUNS, fresh_input, &mut |words| {
        scalar_clock_mix(words)
    });
    let simd_time = benchmark_operation(RUNS, fresh_input, &mut |words| {
        clock_mix_avx2(words)
    });

    // Timings are not printed; the bound below is the only observable result.
    if cfg!(debug_assertions) {
        // Unoptimized builds can make the SIMD path much slower, so only a very
        // loose bound is enforced there.
        assert!(
            simd_time <= scalar_time * 100,
            "SIMD should not be more than 100x slower than scalar in debug builds (SIMD: {:?}, Scalar: {:?})",
            simd_time,
            scalar_time
        );
    } else {
        // Optimized builds: allow some variance, but SIMD must stay within 2x of scalar.
        assert!(
            simd_time <= scalar_time * 2,
            "SIMD should not be more than 2x slower than scalar (SIMD: {:?}, Scalar: {:?})",
            simd_time,
            scalar_time
        );
    }
}
/// Repeats the scalar-vs-SIMD timing at several iteration counts.
///
/// The input is always 16 words; only the number of timed calls changes. Runs with
/// fewer than 1000 iterations are measured but not asserted on.
#[test]
fn test_performance_scaling() {
    // Number of timed calls per measurement.
    let iteration_counts = [100, 500, 1000, 5000];
    for &iterations in &iteration_counts {
        let scalar_time =
            benchmark_operation(iterations, || [0xFEDCBA9876543210u64; 16], &mut |data| {
                scalar_clock_mix(data)
            });
        let simd_time =
            benchmark_operation(iterations, || [0xFEDCBA9876543210u64; 16], &mut |data| {
                clock_mix_avx2(data)
            });

        if iterations < 1000 {
            continue;
        }

        // Compare the durations directly rather than through a floating-point
        // speedup ratio: when a coarse timer reports 0 ns for both paths the ratio
        // is 0.0 / 0.0 = NaN, and `NaN >= x` would fail the test spuriously.
        if cfg!(debug_assertions) {
            // In debug builds SIMD is often slower due to lack of optimization;
            // just ensure it is not completely broken (not more than 100x slower).
            assert!(
                simd_time <= scalar_time * 100,
                "SIMD should not be more than 100x slower at high iteration counts in debug builds"
            );
        } else {
            assert!(
                simd_time <= scalar_time * 2,
                "SIMD should not be more than 2x slower at high iteration counts"
            );
        }
    }
}
/// Feeds the same 128-byte block through the scalar reference path and through
/// `process_block_simd`, then checks that both leave identical state and that the
/// SIMD path took at most three times as long.
#[test]
fn test_block_processing_performance() {
    const ROUNDS: usize = 100;
    let block = [0xABu8; 128];

    // Scalar reference: process the block ROUNDS times, starting from IV.
    let mut scalar_state = crate::constants::IV;
    let scalar_timer = Instant::now();
    (0..ROUNDS).for_each(|_| {
        process_block_simd_scalar(&block, &mut scalar_state);
    });
    let scalar_time = scalar_timer.elapsed();

    // SIMD path: same block, same starting state, same number of rounds.
    let mut simd_state = crate::constants::IV;
    let simd_timer = Instant::now();
    (0..ROUNDS).for_each(|_| {
        process_block_simd(&block, &mut simd_state);
    });
    let simd_time = simd_timer.elapsed();

    // Both paths must arrive at the same state.
    assert_eq!(scalar_state, simd_state);

    // Timings are not printed; only the relative bound is enforced.
    assert!(
        simd_time <= scalar_time * 3,
        "SIMD block processing should not be more than 3x slower"
    );
}
/// Times the scalar and SIMD mixers on inputs built from four word generators
/// ("sequential", "random", "sparse", "dense") and checks every mean time is non-zero.
#[test]
fn test_memory_access_patterns() {
    /// Erases a generator closure to the boxed trait-object type stored in the table.
    fn boxed(generator: impl Fn(usize) -> u64 + 'static) -> Box<dyn Fn(usize) -> u64> {
        Box::new(generator)
    }

    const RUNS: usize = 500;

    // Each generator maps a word index to the value stored at that index.
    let patterns: Vec<(&str, Box<dyn Fn(usize) -> u64>)> = vec![
        ("sequential", boxed(|i| i as u64)),
        (
            "random",
            boxed(|i| (i as u64).wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7)),
        ),
        ("sparse", boxed(|i| if i % 3 == 0 { i as u64 } else { 0 })),
        ("dense", boxed(|i| u64::MAX ^ (i as u64))),
    ];

    for (_label, word_at) in patterns {
        // Builds the 16-word input for the current generator; shared by both runs.
        let build_input = || {
            let mut words = [0u64; 16];
            for (index, slot) in words.iter_mut().enumerate() {
                *slot = word_at(index);
            }
            words
        };

        let scalar_time =
            benchmark_operation(RUNS, &build_input, &mut |words| scalar_clock_mix(words));
        let simd_time =
            benchmark_operation(RUNS, &build_input, &mut |words| clock_mix_avx2(words));

        // Timings are not printed; just ensure both paths ran and took measurable time.
        assert!(simd_time > Duration::new(0, 0));
        assert!(scalar_time > Duration::new(0, 0));
    }
}
/// Checks that 1000 calls to `clock_mix_avx2` finish within one second in total.
///
/// No CPU feature is queried here; the measurement reflects whatever code path
/// `clock_mix_avx2` takes on the current machine.
#[test]
fn test_cpu_feature_performance_impact() {
    let iterations = 1000;
    let mean_time =
        benchmark_operation(iterations, || [0x123456789ABCDEF0u64; 16], &mut |data| {
            clock_mix_avx2(data)
        });

    // `benchmark_operation` returns the mean time of ONE call. Scale it back up so the
    // bound really covers all iterations, as the assertion message states (the mean is
    // truncated to whole nanoseconds, so this slightly underestimates the true total).
    let total_time = mean_time * iterations as u32;

    // Should complete in reasonable time
    assert!(
        total_time < Duration::from_secs(1),
        "SIMD operations should complete within 1 second for {} iterations",
        iterations
    );
}
/// Detects performance regressions by comparing the total time of 10000
/// `clock_mix_avx2` calls against a fixed budget.
#[test]
fn test_performance_regression_detection() {
    let iterations = 10000;
    // Budget for ALL iterations together; adjust based on expected performance.
    let baseline_threshold = Duration::from_millis(100);
    let mean_time =
        benchmark_operation(iterations, || [0xDEADBEEFDEADBEEFu64; 16], &mut |data| {
            clock_mix_avx2(data)
        });

    // `benchmark_operation` returns the mean time of ONE call. Comparing that mean
    // against a budget meant for the whole run would make the check far too loose,
    // so reconstruct the total first (truncated to whole nanoseconds per call).
    let total_time = mean_time * iterations as u32;

    // Should be reasonably fast
    assert!(
        total_time < baseline_threshold,
        "Performance regression detected: took {:?} for {} iterations",
        total_time,
        iterations
    );
}
/// Checks data throughput (bytes processed per second) of `clock_mix_avx2`.
#[test]
fn test_throughput_comparison() {
    let iterations = 1000;
    // One call processes 16 u64 words.
    let bytes_per_operation = (16 * std::mem::size_of::<u64>()) as f64;
    let mean_time =
        benchmark_operation(iterations, || [0xAAAAAAAAAAAAAAAAu64; 16], &mut |data| {
            clock_mix_avx2(data)
        });

    // `benchmark_operation` returns the mean time of ONE call, so it must be paired
    // with the bytes of ONE call; dividing the bytes of all iterations by it would
    // overstate throughput by a factor of `iterations`.
    let seconds_per_operation = mean_time.as_nanos() as f64 / 1_000_000_000.0;
    // A mean of 0 ns (coarse timer) yields +inf here, which still passes the check.
    let throughput_bytes_per_sec = bytes_per_operation / seconds_per_operation;
    let throughput_mb_per_sec = throughput_bytes_per_sec / (1024.0 * 1024.0);

    // Should process at least 1 MB/s (very conservative baseline)
    assert!(
        throughput_mb_per_sec > 1.0,
        "Throughput too low: {:.2} MB/s",
        throughput_mb_per_sec
    );
}
/// Exercises `validate_simd_vs_scalar`; the body is compiled only when the `debug`
/// feature is enabled, otherwise the test is empty and passes trivially.
#[test]
fn test_debug_performance_measurement() {
    #[cfg(feature = "debug")]
    {
        // Input words 1 through 16.
        let mut data = [0u64; 16];
        for (index, word) in data.iter_mut().enumerate() {
            *word = index as u64 + 1;
        }

        let outcome = validate_simd_vs_scalar(
            "performance_test",
            &data,
            |words| clock_mix_avx2(words),
            scalar_clock_mix,
        );
        let result = outcome.expect("Performance measurement should succeed");

        // The report is not printed; check that some time was measured and that the
        // SIMD output matches the scalar output.
        let no_time = Duration::new(0, 0);
        assert!(result.execution_time > no_time);
        assert!(result.output == result.scalar_output);
    }
}
/// Scalar counterpart of `process_block_simd`, used as the reference in tests.
///
/// Decodes `block` as 16 little-endian u64 words, runs `scalar_clock_mix` over them,
/// folds the first 8 mixed words into `state`, then applies `clock_permute`.
fn process_block_simd_scalar(block: &[u8; 128], state: &mut [u64; 8]) {
    // Decode eight bytes at a time into little-endian words.
    let mut words = [0u64; 16];
    for (word, bytes) in words.iter_mut().zip(block.chunks_exact(8)) {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(bytes);
        *word = u64::from_le_bytes(raw);
    }

    // Mix the message words in place.
    scalar_clock_mix(&mut words);

    // Inject into state. Lanes must be visited in ascending order: lanes 4..8 read
    // lanes 0..4 after those have already been updated.
    for (lane, &mixed) in words.iter().take(8).enumerate() {
        let summed = state[lane].wrapping_add(mixed);
        let partner = state[(lane + 4) % 8];
        state[lane] = summed ^ crate::utils::rotl64(partner, 17);
    }

    crate::clockpermute::clock_permute(state);
}
}