// clock-hash 1.0.0

//! Performance comparison tests between SIMD and scalar implementations
//!
//! Benchmarks SIMD operations against scalar implementations to validate performance.

#[cfg(feature = "std")]
mod performance_tests {
    use crate::simd::dispatch::*;
    use crate::simd::scalar::*;
    use std::time::{Duration, Instant};
    use std::vec::Vec;

    /// Benchmark harness for consistent performance measurement.
    ///
    /// For each of `iterations` rounds, builds a fresh input with `setup_fn`
    /// (outside the timed region) and times exactly one call of `operation_fn`
    /// on that input.
    ///
    /// # Returns
    ///
    /// The **average** duration of a single `operation_fn` call (total measured
    /// time divided by `iterations`), not the total. Returns a zero duration
    /// when `iterations` is 0.
    fn benchmark_operation<F>(
        iterations: usize,
        setup_fn: F,
        operation_fn: &mut dyn FnMut(&mut [u64; 16]),
    ) -> Duration
    where
        F: Fn() -> [u64; 16],
    {
        // Nothing to measure; dividing a `Duration` by zero would panic.
        if iterations == 0 {
            return Duration::new(0, 0);
        }

        let mut total_time = Duration::new(0, 0);

        for _ in 0..iterations {
            let mut data = setup_fn();
            let start = Instant::now();
            operation_fn(&mut data);
            // Mark the result as observed so the optimizer cannot discard the
            // work being timed.
            std::hint::black_box(&data);
            total_time += start.elapsed();
        }

        // Average in integer nanoseconds. Widening `iterations` to u128 is
        // lossless, unlike the previous `as u32` cast which could truncate
        // (even to zero) for very large counts.
        let avg_nanos = total_time.as_nanos() / iterations as u128;
        // The average never exceeds the total, so the seconds part fits in u64
        // and the remainder is always below one billion.
        Duration::new(
            (avg_nanos / 1_000_000_000) as u64,
            (avg_nanos % 1_000_000_000) as u32,
        )
    }

    #[test]
    fn test_simd_performance_basic() {
        const RUNS: usize = 1000;
        // Every timed call, on both paths, starts from the same freshly built input.
        let fresh_input = || [0x123456789ABCDEF0u64; 16];

        // Average time per call: scalar reference first, then the SIMD path.
        let scalar_time =
            benchmark_operation(RUNS, fresh_input, &mut |words| scalar_clock_mix(words));
        let simd_time = benchmark_operation(RUNS, fresh_input, &mut |words| clock_mix_avx2(words));

        // The tolerated slowdown depends on the build profile: unoptimized SIMD
        // code can be far slower than scalar code, so debug builds only guard
        // against a pathological slowdown. This may still fail on systems
        // without SIMD support.
        if cfg!(debug_assertions) {
            assert!(
                simd_time <= scalar_time * 100,
                "SIMD should not be more than 100x slower than scalar in debug builds (SIMD: {:?}, Scalar: {:?})",
                simd_time,
                scalar_time
            );
        } else {
            assert!(
                simd_time <= scalar_time * 2,
                "SIMD should not be more than 2x slower than scalar (SIMD: {:?}, Scalar: {:?})",
                simd_time,
                scalar_time
            );
        }
    }

    #[test]
    fn test_performance_scaling() {
        // Repeats the SIMD-vs-scalar comparison at several iteration counts.
        // The input is always 16 words; only the number of timed repetitions varies.
        let iteration_counts = [100, 500, 1000, 5000];

        for &iterations in &iteration_counts {
            let scalar_time =
                benchmark_operation(iterations, || [0xFEDCBA9876543210u64; 16], &mut |data| {
                    scalar_clock_mix(data)
                });

            let simd_time =
                benchmark_operation(iterations, || [0xFEDCBA9876543210u64; 16], &mut |data| {
                    clock_mix_avx2(data)
                });

            // Only the larger iteration counts are asserted on.
            if iterations < 1000 {
                continue;
            }

            // Compare the durations directly instead of via a floating-point
            // speedup ratio: when both averages round down to 0 ns the ratio is
            // NaN, which would fail the assertion spuriously.
            if cfg!(debug_assertions) {
                // In debug builds SIMD is often slower due to lack of optimization;
                // just ensure it is not completely broken.
                assert!(
                    simd_time <= scalar_time * 100,
                    "SIMD should not be more than 100x slower at high iteration counts in debug builds (SIMD: {:?}, Scalar: {:?})",
                    simd_time,
                    scalar_time
                );
            } else {
                assert!(
                    simd_time <= scalar_time * 2,
                    "SIMD should not be more than 2x slower at high iteration counts (SIMD: {:?}, Scalar: {:?})",
                    simd_time,
                    scalar_time
                );
            }
        }
    }

    #[test]
    fn test_block_processing_performance() {
        let iterations = 100;
        let test_block = [0xABu8; 128];

        // Time the scalar reference. Each call continues from the state left
        // by the previous one, so the final state covers all iterations.
        let mut scalar_state = crate::constants::IV;
        let scalar_start = Instant::now();
        for _ in 0..iterations {
            process_block_simd_scalar(&test_block, &mut scalar_state);
        }
        let scalar_time = scalar_start.elapsed();

        // Time the SIMD path on the same block, starting from the same IV.
        let mut simd_state = crate::constants::IV;
        let simd_start = Instant::now();
        for _ in 0..iterations {
            process_block_simd(&test_block, &mut simd_state);
        }
        let simd_time = simd_start.elapsed();

        // Correctness first: both paths must end in the same state.
        assert_eq!(scalar_state, simd_state);

        // Unoptimized SIMD code can be much slower than scalar code, so debug
        // builds only guard against a pathological slowdown, consistent with
        // the other comparisons in this module.
        let max_slowdown: u32 = if cfg!(debug_assertions) { 100 } else { 3 };
        assert!(
            simd_time <= scalar_time * max_slowdown,
            "SIMD block processing should not be more than {}x slower (SIMD: {:?}, Scalar: {:?})",
            max_slowdown,
            simd_time,
            scalar_time
        );
    }

    #[test]
    fn test_memory_access_patterns() {
        // Runs both implementations over inputs filled with different value
        // patterns. Each generator maps a word index (0..16) to that word's value.
        let patterns: Vec<(&str, fn(usize) -> u64)> = vec![
            ("sequential", |i: usize| i as u64),
            ("random", |i: usize| {
                (i as u64).wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7)
            }),
            ("sparse", |i: usize| if i % 3 == 0 { i as u64 } else { 0 }),
            ("dense", |i: usize| u64::MAX ^ (i as u64)),
        ];

        for (name, pattern_fn) in patterns {
            let iterations = 500;

            // One input builder shared by both measurements, so the scalar and
            // SIMD runs are guaranteed to see identical data.
            let make_input = || {
                let mut data = [0u64; 16];
                for (i, word) in data.iter_mut().enumerate() {
                    *word = pattern_fn(i);
                }
                data
            };

            let scalar_time =
                benchmark_operation(iterations, &make_input, &mut |data| scalar_clock_mix(data));

            let simd_time =
                benchmark_operation(iterations, &make_input, &mut |data| clock_mix_avx2(data));

            // Just ensure both complete and register a measurable duration;
            // the pattern name identifies which input failed.
            assert!(
                simd_time > Duration::new(0, 0),
                "SIMD timing for pattern '{}' should be non-zero",
                name
            );
            assert!(
                scalar_time > Duration::new(0, 0),
                "Scalar timing for pattern '{}' should be non-zero",
                name
            );
        }
    }

    #[test]
    fn test_cpu_feature_performance_impact() {
        // Sanity check that the SIMD path completes in reasonable time with
        // whatever CPU features are available on the current machine.
        const ITERATIONS: u32 = 1000;

        // `benchmark_operation` returns the AVERAGE time of one call, not the total.
        let time_per_operation = benchmark_operation(
            ITERATIONS as usize,
            || [0x123456789ABCDEF0u64; 16],
            &mut |data| clock_mix_avx2(data),
        );

        // The budget below is for the whole run, so scale the average back up
        // to an (approximate) total before comparing.
        let total_time = time_per_operation * ITERATIONS;

        // Should complete in reasonable time
        assert!(
            total_time < Duration::from_secs(1),
            "SIMD operations should complete within 1 second for {} iterations (took {:?})",
            ITERATIONS,
            total_time
        );
    }

    #[test]
    fn test_performance_regression_detection() {
        // This test can be used to detect performance regressions
        // by comparing against known good performance baselines

        const ITERATIONS: u32 = 10_000;
        // Budget for the TOTAL measured time of all iterations
        // (about 10 microseconds per call). Adjust based on expected performance.
        let baseline_threshold = Duration::from_millis(100);

        // `benchmark_operation` returns the AVERAGE time of one call, not the total.
        let time_per_iteration = benchmark_operation(
            ITERATIONS as usize,
            || [0xDEADBEEFDEADBEEFu64; 16],
            &mut |data| clock_mix_avx2(data),
        );

        // Scale the average back up so the value matches both the threshold's
        // meaning and the wording of the failure message.
        let total_time = time_per_iteration * ITERATIONS;

        // Should be reasonably fast
        assert!(
            total_time < baseline_threshold,
            "Performance regression detected: took {:?} for {} iterations",
            total_time,
            ITERATIONS
        );
    }

    #[test]
    fn test_throughput_comparison() {
        // Test data throughput (bytes processed per second)
        let iterations = 1000;
        let bytes_per_operation: usize = 16 * 8; // 16 u64 words * 8 bytes each

        // `benchmark_operation` returns the AVERAGE time of one call, so the
        // matching byte count is that of ONE operation. Multiplying the bytes
        // by `iterations` here would overstate throughput by that factor.
        let time_per_operation =
            benchmark_operation(iterations, || [0xAAAAAAAAAAAAAAAAu64; 16], &mut |data| {
                clock_mix_avx2(data)
            });

        // If the timer reports 0 ns per call the quotient is +inf, which still
        // satisfies the lower bound checked below.
        let seconds_per_operation = time_per_operation.as_secs_f64();
        let throughput_bytes_per_sec = bytes_per_operation as f64 / seconds_per_operation;
        let throughput_mb_per_sec = throughput_bytes_per_sec / (1024.0 * 1024.0);

        // Should process at least 1 MB/s (very conservative baseline)
        assert!(
            throughput_mb_per_sec > 1.0,
            "Throughput too low: {:.2} MB/s",
            throughput_mb_per_sec
        );
    }

    #[test]
    fn test_debug_performance_measurement() {
        // The body only exists when the `debug` feature is enabled; without it
        // this test is empty and passes trivially.
        #[cfg(feature = "debug")]
        {
            // Input words 1, 2, ..., 16.
            let mut input = [0u64; 16];
            for (slot, value) in input.iter_mut().zip(1u64..) {
                *slot = value;
            }

            // Exercise the debug measurement utility with the SIMD and scalar mixers.
            let report = validate_simd_vs_scalar(
                "performance_test",
                &input,
                |words| clock_mix_avx2(words),
                scalar_clock_mix,
            )
            .expect("Performance measurement should succeed");

            // Some time must have been measured, and the recorded output must
            // equal the recorded scalar output.
            let zero = Duration::new(0, 0);
            assert!(report.execution_time > zero);
            assert!(report.output == report.scalar_output);
        }
    }

    /// Scalar version of process_block_simd for testing
    ///
    /// Decodes `block` as 16 little-endian `u64` words, runs `scalar_clock_mix`
    /// over them, folds the first 8 mixed words into `state`, and finally
    /// applies `clock_permute` to `state`.
    fn process_block_simd_scalar(block: &[u8; 128], state: &mut [u64; 8]) {
        // 128 bytes split into 16 chunks of 8 bytes, one little-endian word each.
        let mut words = [0u64; 16];
        for (word, chunk) in words.iter_mut().zip(block.chunks_exact(8)) {
            let mut le_bytes = [0u8; 8];
            le_bytes.copy_from_slice(chunk);
            *word = u64::from_le_bytes(le_bytes);
        }

        // Mix the message words.
        scalar_clock_mix(&mut words);

        // Fold the first 8 words into the state. The update is in place and in
        // ascending lane order, so lanes 4..8 read a partner lane that has
        // already been updated in this pass.
        for lane in 0..8 {
            let partner = (lane + 4) % 8;
            let summed = state[lane].wrapping_add(words[lane]);
            state[lane] = summed ^ crate::utils::rotl64(state[partner], 17);
        }

        crate::clockpermute::clock_permute(state);
    }
}