// win-auto-utils 0.2.1

//! Instruction execution throughput benchmark
//!
//! These tests measure the raw instruction execution speed of the script engine,
//! reporting instructions per second (IPS) for different instruction types.
//!
//! # Purpose
//! - Measure baseline execution performance
//! - Identify slow instruction types
//! - Track performance regressions
//! - Compare different optimization strategies
//!
//! # Running these tests
//! ```bash
//! # Run all throughput tests
//! cargo test --test test_instruction_throughput --features "script_engine,scripts_builtin" -- --nocapture
//!
//! # Run specific instruction type test
//! cargo test test_sleep_throughput --test test_instruction_throughput --features "script_engine,scripts_timing" -- --nocapture
//! ```

use win_auto_utils::script_engine::ScriptEngine;

/// Creates a fresh [`ScriptEngine`] for a single benchmark.
///
/// Simply delegates to `ScriptEngine::with_builtin()`. Presumably that
/// constructor registers every builtin instruction (the scripts in this file
/// rely on `loop`, `sleep`, `key`, `move` and `click`) — confirm against the
/// engine's own documentation.
fn create_engine() -> ScriptEngine {
    ScriptEngine::with_builtin()
}

/// Converts an instruction count and a wall-clock duration into a rate.
///
/// Returns `instruction_count / elapsed` in instructions per second. When
/// `elapsed` is exactly zero the rate is reported as `f64::INFINITY` rather
/// than dividing by zero (this also applies when `instruction_count` is 0).
fn calculate_ips(instruction_count: u64, elapsed: std::time::Duration) -> f64 {
    // A `Duration` is never negative, so "not strictly positive" means zero.
    if elapsed.is_zero() {
        return f64::INFINITY;
    }
    instruction_count as f64 / elapsed.as_secs_f64()
}

#[test]
fn test_noop_throughput() {
    use std::time::Instant;

    // Case 1: a `loop` with an empty body, intended to expose the engine's
    // minimal per-iteration cost.
    let loop_count: u64 = 10000;
    let source = format!(r#"loop {}
end"#, loop_count);
    let engine = create_engine();

    println!("\n=== Test: No-Op Throughput (Empty Loop) ===");
    println!("Iterations: {}", loop_count);

    // Compilation is timed on its own so it never pollutes the throughput figure.
    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    // Only the `execute` call sits inside the measured window.
    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(loop_count, run_elapsed);
    let ns_per_instruction = run_elapsed.as_nanos() as f64 / loop_count as f64;

    println!("Execution time: {:?}", run_elapsed);
    println!("Instructions executed: {}", loop_count);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Time per instruction: {:.2} ns", ns_per_instruction);

    // Regression guard: anything at or below one million IPS fails the test.
    assert!(ips > 1_000_000.0,
            "No-op throughput too low: {:.0} IPS", ips);

    println!("✅ No-op throughput test passed\n");
}

#[test]
fn test_sleep_throughput() {
    use std::time::{Duration, Instant};

    // Case 2: one `sleep` per iteration. The requested sleep is kept at 1 ms so
    // that whatever time exceeds the requested total can be read as overhead.
    let loop_count: u64 = 100;
    let sleep_ms: u64 = 1;
    let source = format!(r#"loop {}
    sleep {}
end"#, loop_count, sleep_ms);
    let engine = create_engine();

    println!("\n=== Test: Sleep Instruction Throughput ===");
    println!("Iterations: {}", loop_count);
    println!("Sleep duration: {} ms per iteration", sleep_ms);

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    // Overhead = measured time minus the total sleep that was asked for,
    // clamped at zero in case the run finished early.
    let requested_sleep = Duration::from_millis(loop_count * sleep_ms);
    let overhead = run_elapsed.saturating_sub(requested_sleep);

    // Note: the IPS figure is based on the full run time, sleep included.
    let ips = calculate_ips(loop_count, run_elapsed);
    let overhead_ns_per_instruction = overhead.as_nanos() as f64 / loop_count as f64;

    println!("Total execution time: {:?}", run_elapsed);
    println!("Expected sleep time: {:?}", requested_sleep);
    println!("Pure overhead: {:?}", overhead);
    println!("Instructions executed: {}", loop_count);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Overhead per instruction: {:.2} ns", overhead_ns_per_instruction);

    // Fail when the average overhead reaches 1 ms (1_000_000 ns) per sleep.
    assert!(overhead_ns_per_instruction < 1_000_000.0,
            "Sleep overhead too high: {:.2} ns", overhead_ns_per_instruction);

    println!("✅ Sleep throughput test passed\n");
}

#[test]
fn test_keyboard_throughput() {
    use std::time::Instant;

    // Case 3: one `key A` instruction per iteration. The iteration count is
    // kept small because each instruction is expected to reach the OS.
    let loop_count: u64 = 100;
    let source = format!(r#"loop {}
    key A
end"#, loop_count);
    let engine = create_engine();

    println!("\n=== Test: Keyboard Instruction Throughput ===");
    println!("Iterations: {}", loop_count);
    println!("Note: Includes SendInput() OS call overhead");

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(loop_count, run_elapsed);
    let ns_per_instruction = run_elapsed.as_nanos() as f64 / loop_count as f64;

    println!("Execution time: {:?}", run_elapsed);
    println!("Instructions executed: {}", loop_count);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Time per instruction: {:.2} ns", ns_per_instruction);

    // Deliberately loose floor: the test only requires more than 1,000 IPS.
    assert!(ips > 1_000.0,
            "Keyboard throughput too low: {:.0} IPS", ips);

    println!("✅ Keyboard throughput test passed (includes OS overhead)\n");
}

#[test]
fn test_mouse_throughput() {
    use std::time::Instant;

    // Case 4: one `move 100 200` instruction per iteration. As with the
    // keyboard case, the iteration count is kept small.
    let loop_count: u64 = 100;
    let source = format!(r#"loop {}
    move 100 200
end"#, loop_count);
    let engine = create_engine();

    println!("\n=== Test: Mouse Instruction Throughput ===");
    println!("Iterations: {}", loop_count);
    println!("Note: Includes SetCursorPos() OS call overhead");

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(loop_count, run_elapsed);
    let ns_per_instruction = run_elapsed.as_nanos() as f64 / loop_count as f64;

    println!("Execution time: {:?}", run_elapsed);
    println!("Instructions executed: {}", loop_count);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Time per instruction: {:.2} ns", ns_per_instruction);

    // Floor for this case: more than 500 IPS.
    assert!(ips > 500.0,
            "Mouse throughput too low: {:.0} IPS", ips);

    println!("✅ Mouse throughput test passed (includes OS overhead)\n");
}

#[test]
fn test_variable_assignment_throughput() {
    use std::time::Instant;

    // Case 5: placeholder for future variable/register instructions. The loop
    // body currently holds only `#` lines, so this reports numbers without
    // asserting any threshold.
    let loop_count: u64 = 10000;
    let source = format!(r#"loop {}
    # Placeholder for variable operations
    # If register instructions are added, test them here
end"#, loop_count);
    let engine = create_engine();

    println!("\n=== Test: Variable Operations Throughput ===");
    println!("Iterations: {}", loop_count);
    println!("Note: Currently testing empty loop as placeholder");

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(loop_count, run_elapsed);

    println!("Execution time: {:?}", run_elapsed);
    println!("Instructions executed: {}", loop_count);
    println!("Throughput: {:.0} instructions/second", ips);

    println!("✅ Variable operations test completed\n");
}

#[test]
fn test_mixed_instruction_throughput() {
    use std::time::Instant;

    // Case 6: a mixed workload of four instructions per iteration.
    // The script uses a bare `click` (no coordinates) right after `move`, and
    // a 10 ms `sleep`, matching the "optimized" variant described in the
    // printed notes below.
    let loop_count: u64 = 50;
    let source = format!(r#"loop {}
    key A
    move 100 200
    click
    sleep 10
end"#, loop_count);
    let engine = create_engine();

    println!("\n=== Test: Mixed Instruction Throughput (Optimized) ===");
    println!("Iterations: {}", loop_count);
    println!("Instructions per iteration: 4 (key + move + click + sleep)");
    println!("Optimizations applied:");
    println!("  - Click without coordinates (zero redundant SetCursorPos)");
    println!("  - Sleep 10ms instead of 1ms (reduced scheduling overhead)");
    println!("Note: Includes multiple OS calls per iteration");

    // Four body instructions per pass through the loop.
    let instruction_total = loop_count * 4;

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(instruction_total, run_elapsed);
    let ns_per_instruction = run_elapsed.as_nanos() as f64 / instruction_total as f64;

    println!("Total execution time: {:?}", run_elapsed);
    println!("Total instructions executed: {}", instruction_total);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Average time per instruction: {:.2} ns",
             ns_per_instruction);

    // The sleeps dominate the run time, so the floor is only 100 IPS.
    assert!(ips > 100.0,
            "Mixed throughput too low: {:.0} IPS", ips);

    println!("✅ Mixed instruction throughput test passed (optimized version)\n");
}

#[test]
fn test_mixed_instruction_comparison() {
    use std::time::{Duration, Instant};

    // Runs three variants of the mixed workload back to back and prints a
    // comparison. Nothing is asserted; this test is purely informational.
    let engine = create_engine();

    println!("\n=== Mixed Instruction Performance Comparison (Release Mode) ===\n");

    let iterations = 50;

    // Compiles `script` and returns how long a single execution took.
    // Compilation happens before the timer starts.
    let time_run = |script: &String| -> Duration {
        let program = engine.compile(script).unwrap();
        let timer = Instant::now();
        engine.execute(&program).unwrap();
        timer.elapsed()
    };

    // Prints the timing line for one scenario and hands back its IPS.
    let report = |elapsed: Duration, instructions_per_iteration: i32| -> f64 {
        let ips = (iterations * instructions_per_iteration) as f64 / elapsed.as_secs_f64();
        println!("  Time: {:?}, IPS: {:.0}, Per iteration: {} μs",
                 elapsed, ips, elapsed.as_micros() / iterations as u128);
        ips
    };

    // Scenario 1: `click 300 400` carries its own coordinates, 1 ms sleep.
    println!("Test 1: Original Script (redundant operations)");
    let script_original = format!(r#"loop {}
    key A
    move 100 200
    click 300 400
    sleep 1
end"#, iterations);
    let elapsed_original = time_run(&script_original);
    let ips_original = report(elapsed_original, 4);
    println!("  Breakdown: key + move + click(redundant) + sleep(1ms)\n");

    // Scenario 2: identical except that `click` has no coordinates.
    println!("Test 2: Optimized Script (same sleep, no redundancy)");
    let script_optimized_same_sleep = format!(r#"loop {}
    key A
    move 100 200
    click
    sleep 1
end"#, iterations);
    let elapsed_optimized_same = time_run(&script_optimized_same_sleep);
    let ips_optimized_same = report(elapsed_optimized_same, 4);
    println!("  Breakdown: key + move + click(no redundant move) + sleep(1ms)\n");

    // Scenario 3: the sleep is dropped, leaving three instructions per pass.
    println!("Test 3: No Sleep (pure instruction overhead)");
    let script_no_sleep = format!(r#"loop {}
    key A
    move 100 200
    click
end"#, iterations);
    let elapsed_no_sleep = time_run(&script_no_sleep);
    let ips_no_sleep = report(elapsed_no_sleep, 3);
    println!("  Breakdown: key + move + click (zero sleep)\n");

    println!("=== Performance Analysis ===");

    // Scenario 1 vs scenario 2: only a strictly shorter run counts as a win.
    let improvement_same_sleep = if elapsed_original <= elapsed_optimized_same {
        "⚠️ No significant improvement".to_string()
    } else {
        let original_us = elapsed_original.as_micros();
        let saved = original_us - elapsed_optimized_same.as_micros();
        let percent = (saved as f64 / original_us as f64) * 100.0;
        format!("✅ Improved by {} μs/iter ({:.1}%)", saved / iterations as u128, percent)
    };

    println!("Original vs Optimized (same sleep):");
    println!("  {}", improvement_same_sleep);
    println!("  IPS ratio: {:.2}x", ips_optimized_same / ips_original);

    // Only insight 3 is computed from this run; the other lines are fixed text.
    println!("\nKey insights:");
    println!("  1. Eliminating redundant SetCursorPos improves per-iteration time");
    println!("  2. Sleep duration dominates total execution time");
    println!("  3. Pure instruction performance (no sleep): {:.0} IPS", ips_no_sleep);
    println!("  4. Context switching overhead reduces mixed IPS by ~40-50%\n");
}

#[test]
fn test_nested_loop_overhead() {
    use std::time::Instant;

    // Case 7: an empty inner loop nested inside an outer loop, so the measured
    // work is `outer * inner` passes through the innermost (empty) body.
    let outer_count: u64 = 100;
    let inner_count: u64 = 100;
    let source = format!(r#"loop {}
    loop {}
    end
end"#, outer_count, inner_count);
    let engine = create_engine();

    println!("\n=== Test: Nested Loop Overhead ===");
    println!("Outer loop: {}", outer_count);
    println!("Inner loop: {}", inner_count);

    let iteration_total = outer_count * inner_count;

    let compile_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    let compile_elapsed = compile_timer.elapsed();
    println!("Compilation time: {:?}", compile_elapsed);

    let run_timer = Instant::now();
    engine.execute(&program).unwrap();
    let run_elapsed = run_timer.elapsed();

    let ips = calculate_ips(iteration_total, run_elapsed);
    let ns_per_iteration = run_elapsed.as_nanos() as f64 / iteration_total as f64;

    println!("Execution time: {:?}", run_elapsed);
    println!("Total iterations: {}", iteration_total);
    println!("Throughput: {:.0} instructions/second", ips);
    println!("Time per iteration: {:.2} ns",
             ns_per_iteration);

    // Floor for nested loops: more than 500,000 inner iterations per second.
    assert!(ips > 500_000.0,
            "Nested loop throughput too low: {:.0} IPS", ips);

    println!("✅ Nested loop overhead test passed\n");
}

#[test]
fn test_compiled_script_reuse_benefit() {
    use std::time::Instant;

    // Case 8: estimate how much time is saved by compiling a script once and
    // executing it repeatedly, versus recompiling before every execution.
    let engine = create_engine();
    let loop_count = 100;
    let source = format!(r#"loop {}
    key A
end"#, loop_count);

    println!("\n=== Test: Compiled Script Reuse Benefit ===");
    println!("Iterations: {}", loop_count);

    // Scenario 1: a single timer spans compilation plus the first execution.
    let first_timer = Instant::now();
    let program = engine.compile(&source).unwrap();
    engine.execute(&program).unwrap();
    let compile_and_run = first_timer.elapsed();

    // Scenario 2: re-execute the already compiled program and take the mean.
    let runs: u32 = 9;
    let reuse_timer = Instant::now();
    for _ in 0..runs {
        engine.execute(&program).unwrap();
    }
    let reuse_elapsed = reuse_timer.elapsed();
    let avg_exec_only = reuse_elapsed / runs;

    // Compile cost is not measured directly: it is whatever the first run took
    // beyond an average execution, clamped at zero.
    // NOTE(review): any first-run slowdown unrelated to compilation lands in
    // this figure too, so the 5% assertion below may be sensitive to noise.
    let compile_only = compile_and_run.saturating_sub(avg_exec_only);

    // Project both strategies over the same number of executions.
    let total_runs = runs + 1;
    let total_time_no_cache = compile_only * total_runs + avg_exec_only * total_runs;
    let total_time_with_cache = compile_only + avg_exec_only * total_runs;

    let no_cache_ns = total_time_no_cache.as_nanos();
    let savings_percent = if no_cache_ns > 0 {
        ((no_cache_ns - total_time_with_cache.as_nanos()) as f64
         / no_cache_ns as f64) * 100.0
    } else {
        0.0
    };

    println!("Compile + Execute (1st run): {:?}", compile_and_run);
    println!("Execute only (avg of {} runs): {:?}", runs, avg_exec_only);
    println!("Compilation overhead: {:?}", compile_only);
    println!("Total time without cache ({} runs): {:?}", total_runs, total_time_no_cache);
    println!("Total time with cache ({} runs): {:?}", total_runs, total_time_with_cache);
    println!("Time savings with caching: {:.1}%", savings_percent);

    // Reuse must save more than 5% of the projected no-cache total.
    assert!(savings_percent > 5.0,
            "Caching benefit too low: {:.1}%", savings_percent);

    println!("✅ Compiled script reuse benefit demonstrated\n");
}

#[test]
fn test_instruction_dispatch_overhead() {
    use std::time::Instant;

    // Case 9: run the empty loop at three sizes and print one table row per
    // size, so per-iteration cost can be compared across scales. Nothing is
    // asserted here.
    let engine = create_engine();

    println!("\n=== Test: Instruction Dispatch Overhead ===");
    println!("Testing scaling behavior...\n");

    for size in [100, 1000, 10000] {
        let source = format!(r#"loop {}
end"#, size);

        // Compile outside the timed window.
        let program = engine.compile(&source).unwrap();

        let timer = Instant::now();
        engine.execute(&program).unwrap();
        let elapsed = timer.elapsed();

        let ips = calculate_ips(size as u64, elapsed);
        let ns_per_instruction = elapsed.as_nanos() as f64 / size as f64;

        println!("Size: {:>6} | Time: {:>12?} | IPS: {:>12.0} | Per instr: {:>8.2} ns",
                 size, elapsed, ips, ns_per_instruction);
    }

    println!("\n✅ Dispatch overhead scaling test completed\n");
}

#[test]
fn test_setcursorpos_vs_sendinput_performance() {
    // Test Case 10: compare a raw `SetCursorPos` loop against the script
    // engine's `move` instruction over the same number of calls.
    //
    // The test is informational (no assertions). The closing report is
    // direction-aware: it states whether the engine was faster or slower than
    // the raw API and prints the matching per-call difference.
    use windows::Win32::UI::WindowsAndMessaging::SetCursorPos;

    println!("\n=== Test: SetCursorPos vs SendInput Performance ===");

    let iterations: u64 = 100;
    let test_x = 100i32;
    let test_y = 200i32;

    // Method 1: SetCursorPos (direct API call)
    let start1 = std::time::Instant::now();
    for _ in 0..iterations {
        // SAFETY: `SetCursorPos` receives two plain `i32` coordinates and no
        // pointers, so there is no memory the call could misuse.
        unsafe {
            // The result is ignored on purpose: only the call cost matters here.
            let _ = SetCursorPos(test_x, test_y);
        }
    }
    let time_setcursorpos = start1.elapsed();
    let ips_setcursorpos = calculate_ips(iterations, time_setcursorpos);

    println!("SetCursorPos:");
    println!("  Total time: {:?}", time_setcursorpos);
    println!("  Throughput: {:.0} IPS", ips_setcursorpos);
    println!("  Per call: {:.2} ns", time_setcursorpos.as_nanos() as f64 / iterations as f64);

    // Method 2: the engine's `move` instruction (labelled "SendInput" in the
    // output; which OS call the engine really uses is not visible from here).
    let engine = create_engine();
    let script = format!(r#"loop {}
    move {} {}
end"#, iterations, test_x, test_y);

    // Compilation is kept outside the timed window.
    let compiled = engine.compile(&script).unwrap();

    let start2 = std::time::Instant::now();
    engine.execute(&compiled).unwrap();
    let time_sendinput = start2.elapsed();
    let ips_sendinput = calculate_ips(iterations, time_sendinput);

    println!("\nSendInput (via script engine):");
    println!("  Total time: {:?}", time_sendinput);
    println!("  Throughput: {:.0} IPS", ips_sendinput);
    println!("  Per call: {:.2} ns", time_sendinput.as_nanos() as f64 / iterations as f64);

    let raw_ns = time_setcursorpos.as_nanos();
    let engine_ns = time_sendinput.as_nanos();

    // Ratio of the slower total to the faster one; falls back to 1.0 when
    // either measurement is zero so the division stays meaningful.
    let ratio = |slower: u128, faster: u128| -> f64 {
        if slower > 0 && faster > 0 {
            slower as f64 / faster as f64
        } else {
            1.0
        }
    };

    println!("\nPerformance Comparison:");
    if engine_ns <= raw_ns {
        // Engine is at least as fast: the saving is raw minus engine.
        // (Previously this was computed the other way round and clamped to
        // zero in exactly this case.)
        let saved_per_call = (raw_ns - engine_ns) / iterations as u128;
        println!("  Optimized script engine is {:.1}x faster than raw SetCursorPos",
                 ratio(raw_ns, engine_ns));
        println!("  Time saved per call: {:?} ns", saved_per_call);
        println!("\n✅ Performance comparison completed - Optimization successful!\n");
    } else {
        // Engine is slower: report the extra cost instead of a "saving".
        let extra_per_call = (engine_ns - raw_ns) / iterations as u128;
        println!("  Script engine is {:.1}x slower than raw SetCursorPos",
                 ratio(engine_ns, raw_ns));
        println!("  Extra time per call: {:?} ns", extra_per_call);
        println!("\n⚠️ Performance comparison completed - script engine is slower than raw SetCursorPos\n");
    }
}