// clock-hash 1.0.0

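//! SIMD performance benchmarks for ClockHash-256
//!
//! This module benchmarks the SIMD code paths against the scalar
//! implementation: AVX2 vs scalar ClockMix, CPU feature detection overhead,
//! and SIMD block dispatch. AVX-512 availability is probed, but the
//! AVX-512 entries currently run the AVX2 kernel as a placeholder because
//! no AVX-512 entry point is imported here.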

#[cfg(feature = "simd")]
use clock_hash::simd::{
    dispatch::{clock_mix_avx2, is_avx2_available, is_avx512_available, process_block_simd},
    scalar::scalar_clock_mix,
};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use std::hint::black_box;

/// Benchmark AVX2 vs scalar ClockMix performance
///
/// For each iteration count, repeatedly applies `scalar_clock_mix` to a
/// 16-word state and, when `is_avx2_available()` reports support, does the
/// same with `clock_mix_avx2`. Both variants are registered in the
/// `clock_mix_simd_vs_scalar` group so their timings can be compared.
///
/// The final state is passed through `black_box` so the optimizer cannot
/// treat the last mixing call as dead work.
#[cfg(feature = "simd")]
fn bench_clock_mix_simd_vs_scalar(c: &mut Criterion) {
    let mut group = c.benchmark_group("clock_mix_simd_vs_scalar");

    let iterations = [1000, 10000, 100000];

    for &iter in &iterations {
        // Scalar benchmark
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("scalar_{}_iterations", iter)),
            &iter,
            |b, &iter| {
                b.iter(|| {
                    let mut data = [0x123456789ABCDEF0u64; 16];
                    for _ in 0..iter {
                        scalar_clock_mix(black_box(&mut data));
                    }
                    // Observe the result so the last call cannot be elided.
                    black_box(data);
                });
            },
        );

        // AVX2 benchmark (only if available)
        if is_avx2_available() {
            group.bench_with_input(
                BenchmarkId::from_parameter(format!("avx2_{}_iterations", iter)),
                &iter,
                |b, &iter| {
                    b.iter(|| {
                        let mut data = [0x123456789ABCDEF0u64; 16];
                        for _ in 0..iter {
                            clock_mix_avx2(black_box(&mut data));
                        }
                        // Observe the result so the last call cannot be elided.
                        black_box(data);
                    });
                },
            );
        }
    }

    group.finish();
}

/// Benchmark AVX-512 vs AVX2 vs scalar performance
///
/// Times a single ClockMix call over three fixed 16-word inputs (`zeros`,
/// `ones`, and a multiplicative `pattern`). Each measurement copies the
/// input into a local array first so every sample starts from the same state.
///
/// NOTE: the `*_avx512` entries currently execute `clock_mix_avx2`, because
/// no AVX-512 entry point is imported by this benchmark. Until one is wired
/// in, those numbers reflect AVX2 on AVX-512-capable hardware, not an
/// AVX-512 kernel.
#[cfg(feature = "simd")]
fn bench_simd_implementations_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_implementations_comparison");

    let test_data: [(&str, [u64; 16]); 3] = [
        ("zeros", [0u64; 16]),
        ("ones", [u64::MAX; 16]),
        (
            "pattern",
            // Word i is i multiplied (wrapping) by a fixed 64-bit constant.
            std::array::from_fn(|i| (i as u64).wrapping_mul(0x9E3779B97F4A7C15)),
        ),
    ];

    // `data` is only read (copied per sample), so it needs no `mut` binding.
    for (name, data) in test_data {
        // Scalar baseline
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("{}_scalar", name)),
            &data,
            |b, data| {
                b.iter(|| {
                    let mut local_data = *data;
                    scalar_clock_mix(black_box(&mut local_data));
                    black_box(local_data);
                });
            },
        );

        // AVX2 implementation
        if is_avx2_available() {
            group.bench_with_input(
                BenchmarkId::from_parameter(format!("{}_avx2", name)),
                &data,
                |b, data| {
                    b.iter(|| {
                        let mut local_data = *data;
                        clock_mix_avx2(black_box(&mut local_data));
                        black_box(local_data);
                    });
                },
            );
        }

        // AVX-512 slot (registered only when the CPU reports AVX-512)
        if is_avx512_available() {
            group.bench_with_input(
                BenchmarkId::from_parameter(format!("{}_avx512", name)),
                &data,
                |b, data| {
                    b.iter(|| {
                        let mut local_data = *data;
                        // Placeholder: AVX-512 dispatch isn't exposed, so this
                        // measures the AVX2 kernel under the AVX-512 label.
                        clock_mix_avx2(black_box(&mut local_data));
                        black_box(local_data);
                    });
                },
            );
        }
    }

    group.finish();
}

/// Benchmark SIMD dispatch overhead
///
/// Registers two benchmarks per call count in the `simd_dispatch_overhead`
/// group:
///
/// * `feature_detection_*_calls` — calls `is_avx2_available()` and
///   `is_avx512_available()` back to back. Every result is fed to
///   `black_box` so the optimizer cannot collapse the loop into one probe.
/// * `block_dispatch_*_blocks` — feeds the same 128-byte block through
///   `process_block_simd`, starting from `clock_hash::constants::IV`.
#[cfg(feature = "simd")]
fn bench_simd_dispatch_overhead(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dispatch_overhead");

    let iterations = [10000, 100000];

    // The input block is the same for every call count; build it once.
    let test_block = [0x42u8; 128];

    for &iter in &iterations {
        // Feature detection overhead
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("feature_detection_{}_calls", iter)),
            &iter,
            |b, &iter| {
                b.iter(|| {
                    for _ in 0..iter {
                        // Observe every probe, not just the last one.
                        black_box(is_avx2_available());
                        black_box(is_avx512_available());
                    }
                });
            },
        );

        // Block processing dispatch
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("block_dispatch_{}_blocks", iter)),
            &iter,
            |b, &iter| {
                b.iter(|| {
                    let mut state = clock_hash::constants::IV;
                    for _ in 0..iter {
                        process_block_simd(black_box(&test_block), black_box(&mut state));
                    }
                    black_box(state);
                });
            },
        );
    }

    group.finish();
}

/// Benchmark SIMD performance scaling with data size
///
/// Builds inputs of 1 to 10000 128-byte blocks, where byte `offset` of block
/// `index` holds `(index * 128 + offset) % 256`. Each input is then run
/// through `process_block_simd` block by block, starting from
/// `clock_hash::constants::IV`.
#[cfg(feature = "simd")]
fn bench_simd_scaling(c: &mut Criterion) {
    // Number of blocks to process in each benchmark case.
    const BLOCK_COUNTS: [usize; 5] = [1, 10, 100, 1000, 10000];

    let mut group = c.benchmark_group("simd_scaling");

    for block_count in BLOCK_COUNTS {
        let input: Vec<[u8; 128]> = (0..block_count)
            .map(|index| std::array::from_fn(|offset| ((index * 128 + offset) % 256) as u8))
            .collect();

        group.bench_with_input(
            BenchmarkId::from_parameter(format!("{block_count}_blocks")),
            &input,
            |bencher, input| {
                bencher.iter(|| {
                    let mut state = clock_hash::constants::IV;
                    for block in input {
                        process_block_simd(black_box(block), black_box(&mut state));
                    }
                    black_box(state);
                });
            },
        );
    }

    group.finish();
}

/// Benchmark SIMD memory access patterns
///
/// Fills a 16-word state from one of five index-to-word generators
/// (`sequential`, `random`, `sparse`, `dense`, `alternating`) and mixes it
/// once, using `clock_mix_avx2` when `is_avx2_available()` is true and
/// `scalar_clock_mix` otherwise. State construction and the availability
/// check both happen inside the timed closure.
#[cfg(feature = "simd")]
fn bench_simd_memory_patterns(c: &mut Criterion) {
    // Word i is simply i.
    fn sequential_pattern(i: usize) -> u64 {
        i as u64
    }

    // Word i is i times a fixed constant (wrapping), rotated left by 7 bits.
    fn random_pattern(i: usize) -> u64 {
        (i as u64).wrapping_mul(0x9E3779B97F4A7C15).rotate_left(7)
    }

    // Every third word is i; all others are zero.
    fn sparse_pattern(i: usize) -> u64 {
        if i % 3 == 0 { i as u64 } else { 0 }
    }

    // All bits set, XORed with i.
    fn dense_pattern(i: usize) -> u64 {
        u64::MAX ^ (i as u64)
    }

    // Even words are zero, odd words are all ones.
    fn alternating_pattern(i: usize) -> u64 {
        if i % 2 == 0 { 0 } else { u64::MAX }
    }

    let generators: [(&str, fn(usize) -> u64); 5] = [
        ("sequential", sequential_pattern),
        ("random", random_pattern),
        ("sparse", sparse_pattern),
        ("dense", dense_pattern),
        ("alternating", alternating_pattern),
    ];

    let mut group = c.benchmark_group("simd_memory_patterns");

    for &(label, generator) in &generators {
        group.bench_with_input(
            BenchmarkId::from_parameter(label),
            &generator,
            |bencher, generator| {
                bencher.iter(|| {
                    let mut words: [u64; 16] = std::array::from_fn(|i| generator(i));

                    if is_avx2_available() {
                        clock_mix_avx2(black_box(&mut words));
                    } else {
                        scalar_clock_mix(black_box(&mut words));
                    }

                    black_box(words);
                });
            },
        );
    }

    group.finish();
}

/// Benchmark SIMD throughput for large data sets
///
/// Hashes a 1 MiB buffer of `0x42` bytes in 128-byte blocks through
/// `process_block_simd`, starting from `clock_hash::constants::IV`.
///
/// Throughput is declared in bytes, so Criterion reports a byte rate rather
/// than "elements" per second. The byte count comes from the same constant
/// that sizes the buffer, so the two cannot drift apart.
#[cfg(feature = "simd")]
fn bench_simd_throughput(c: &mut Criterion) {
    const BLOCK_SIZE: usize = 128;
    const DATA_SIZE: usize = 1024 * 1024; // 1 MiB

    let mut group = c.benchmark_group("simd_throughput");
    // usize -> u64 is lossless on every supported target width.
    group.throughput(criterion::Throughput::Bytes(DATA_SIZE as u64));

    let data = vec![0x42u8; DATA_SIZE];

    group.bench_function("1MB_bulk_hash_simd", |b| {
        b.iter(|| {
            // Process data in 128-byte blocks using SIMD dispatch
            let mut state = clock_hash::constants::IV;

            // `chunks_exact` skips any trailing partial block, like the
            // original `while remaining.len() >= 128` loop did.
            for chunk in data.chunks_exact(BLOCK_SIZE) {
                // Borrow the block as a fixed-size array instead of copying
                // it, so the copy does not pollute the measurement.
                let block: &[u8; BLOCK_SIZE] = chunk
                    .try_into()
                    .expect("chunks_exact yields slices of exactly BLOCK_SIZE bytes");
                process_block_simd(black_box(block), black_box(&mut state));
            }

            black_box(state);
        });
    });

    group.finish();
}

/// Benchmark SIMD warm-up and initialization overhead
///
/// Registers two benchmarks in the `simd_initialization` group:
///
/// * `first_simd_call` — one ClockMix call on a fresh 16-word state, using
///   `clock_mix_avx2` when available and `scalar_clock_mix` otherwise.
/// * `cpu_feature_detection` — one call each to `is_avx2_available()` and
///   `is_avx512_available()`.
#[cfg(feature = "simd")]
fn bench_simd_initialization(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_initialization");

    group.bench_function("first_simd_call", |bencher| {
        bencher.iter(|| {
            let mut words = [0xDEADBEEFDEADBEEFu64; 16];
            if is_avx2_available() {
                clock_mix_avx2(black_box(&mut words));
            } else {
                scalar_clock_mix(black_box(&mut words));
            }
            black_box(words);
        });
    });

    group.bench_function("cpu_feature_detection", |bencher| {
        bencher.iter(|| {
            // Tuple fields evaluate left to right: AVX2 probe, then AVX-512.
            black_box((is_avx2_available(), is_avx512_available()));
        });
    });

    group.finish();
}

/// Benchmark SIMD fallback behavior
///
/// Compares, for each round count, a loop that always calls
/// `scalar_clock_mix` against a loop that checks `is_avx2_available()` on
/// every round and calls `clock_mix_avx2` or `scalar_clock_mix` accordingly.
#[cfg(feature = "simd")]
fn bench_simd_fallback(c: &mut Criterion) {
    // Initial value of every word in the 16-word state.
    const SEED: u64 = 0xAAAAAAAAAAAAAAAA;
    const ROUND_COUNTS: [i32; 2] = [1000, 10000];

    let mut group = c.benchmark_group("simd_fallback");

    for rounds in ROUND_COUNTS {
        // Scalar path only, as if no SIMD were available.
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("forced_scalar_{rounds}_iterations")),
            &rounds,
            |bencher, &rounds| {
                bencher.iter(|| {
                    let mut words = [SEED; 16];
                    for _ in 0..rounds {
                        scalar_clock_mix(black_box(&mut words));
                    }
                    black_box(words);
                });
            },
        );

        // Per-round runtime selection between AVX2 and scalar.
        group.bench_with_input(
            BenchmarkId::from_parameter(format!("auto_dispatch_{rounds}_iterations")),
            &rounds,
            |bencher, &rounds| {
                bencher.iter(|| {
                    let mut words = [SEED; 16];
                    for _ in 0..rounds {
                        if is_avx2_available() {
                            clock_mix_avx2(black_box(&mut words));
                        } else {
                            scalar_clock_mix(black_box(&mut words));
                        }
                    }
                    black_box(words);
                });
            },
        );
    }

    group.finish();
}

// Register every SIMD benchmark in a single Criterion group named `benches`.
// The group is compiled only when the `simd` feature is enabled, because
// each listed function is itself gated on that feature.
#[cfg(feature = "simd")]
criterion_group!(
    benches,
    bench_clock_mix_simd_vs_scalar,
    bench_simd_implementations_comparison,
    bench_simd_dispatch_overhead,
    bench_simd_scaling,
    bench_simd_memory_patterns,
    bench_simd_throughput,
    bench_simd_initialization,
    bench_simd_fallback,
);

// Generate the benchmark binary's `main` from the `benches` group.
#[cfg(feature = "simd")]
criterion_main!(benches);

// Fallback entry point for builds without the `simd` feature: print how to
// enable the benchmarks on stderr and exit with status 1.
#[cfg(not(feature = "simd"))]
fn main() {
    const NOTICE: [&str; 2] = [
        "SIMD benchmarks require the 'simd' feature to be enabled.",
        "Run with: cargo bench --bench simd_bench --features simd",
    ];

    for line in NOTICE {
        eprintln!("{line}");
    }

    std::process::exit(1);
}