/// Checks the field values and accessor results of
/// `PtxOptimizationHints::max_throughput()`.
#[test]
fn test_imp_1000d_max_throughput_preset() {
    let preset = PtxOptimizationHints::max_throughput();

    // Plain fields of the preset.
    assert_eq!(preset.memory_pattern, MemoryPattern::Vector4);
    let tiling = &preset.register_tiling;
    assert_eq!(tiling.width, 8);
    assert_eq!(tiling.height, 8);
    assert_eq!(
        preset.bank_conflict_strategy,
        BankConflictStrategy::Padding
    );
    assert!(preset.enable_ilp);

    // Accessor methods on the same preset.
    let vectorized = preset.uses_vectorized_loads();
    assert!(vectorized);
    let lanes = preset.vector_width();
    assert_eq!(lanes, 4);
    let padding = preset.shared_mem_padding();
    assert_eq!(padding, 1);
}
/// Checks the dimensions of `RegisterTiling::large()` and the
/// `registers_needed()` result of the large, medium and small presets.
#[test]
fn test_imp_1000d_register_tiling() {
    // Large preset: both dimensions and the register count are pinned.
    let big = RegisterTiling::large();
    assert_eq!(big.width, 8);
    assert_eq!(big.height, 8);
    assert_eq!(big.registers_needed(), 64);

    // Medium and small presets: only the register count is pinned.
    assert_eq!(RegisterTiling::medium().registers_needed(), 16);
    assert_eq!(RegisterTiling::small().registers_needed(), 4);
}
/// Builds a `PtxOptimizer` from the max-throughput hints and checks its
/// summary text, register estimate and padded shared-memory row width.
#[test]
fn test_imp_1000d_ptx_optimizer() {
    let optimizer = PtxOptimizer::new(PtxOptimizationHints::max_throughput());

    // Each fragment must appear somewhere in the summary string.
    let summary = optimizer.summary();
    for fragment in ["vec=4", "8x8", "ilp=true"] {
        assert!(
            summary.contains(fragment),
            "Expected {} in: {}",
            fragment,
            summary
        );
    }

    let registers = optimizer.estimated_registers();
    assert_eq!(registers, 144);
    assert!(optimizer.is_high_register_pressure());

    // A 32-wide row is expected to come back as 33.
    let padded_row = optimizer.padded_shared_mem_row(32);
    assert_eq!(padded_row, 33);
}
/// Builds a `PtxOptimizer` from the low-latency hints and checks that
/// vectorized loads and ILP are off and register pressure is not high.
#[test]
fn test_imp_1000d_low_latency_preset() {
    let optimizer = PtxOptimizer::new(PtxOptimizationHints::low_latency());

    // Inspect the hints held by the optimizer.
    let active = optimizer.hints();
    assert!(!active.uses_vectorized_loads());
    assert_eq!(active.vector_width(), 1);
    assert!(!active.enable_ilp);

    // Register estimate reported by the optimizer itself.
    let registers = optimizer.estimated_registers();
    assert_eq!(registers, 20);
    assert!(!optimizer.is_high_register_pressure());
}
/// Checks `shared_mem_padding()` for each `BankConflictStrategy` variant
/// set on a default `PtxOptimizationHints`.
#[test]
fn test_imp_1000d_bank_conflict_strategies() {
    let mut hints = PtxOptimizationHints::default();

    // (strategy to set, padding expected back), applied in this order.
    let cases = [
        (BankConflictStrategy::None, 0),
        (BankConflictStrategy::Padding, 1),
        (BankConflictStrategy::Xor, 0),
    ];
    for (strategy, expected_padding) in cases {
        hints.bank_conflict_strategy = strategy;
        assert_eq!(hints.shared_mem_padding(), expected_padding);
    }
}
/// Checks that a freshly constructed `StressTestRunner` reports no completed
/// cycles and no frames, and that the config keeps its seed after cloning.
#[test]
fn test_imp_800d_stress_runner_config() {
    use trueno_gpu::testing::{PerformanceThresholds, StressConfig, StressTestRunner};

    let thresholds = PerformanceThresholds {
        max_frame_time_ms: 100,
        max_memory_bytes: 64 * 1024 * 1024,
        max_timing_variance: 0.5,
        max_failure_rate: 0.01,
    };
    let config = StressConfig {
        cycles: 10,
        interval_ms: 0,
        seed: 42,
        min_input_size: 64,
        max_input_size: 256,
        thresholds,
    };

    // The runner gets a clone so `config` stays readable below.
    let runner = StressTestRunner::new(config.clone());

    // Nothing has been run yet, so the report must be empty.
    let initial = runner.report();
    assert_eq!(initial.cycles_completed, 0);
    assert!(initial.frames.is_empty());

    assert_eq!(config.seed, 42);
}
/// Builds a ten-frame `StressReport` and runs `verify_performance` against a
/// frame-time limit of 50 (expected to pass) and of 30 (expected to fail).
#[test]
fn test_imp_800d_performance_verification() {
    use trueno_gpu::testing::{
        verify_performance, FrameProfile, PerformanceThresholds, StressReport,
    };

    // Frame durations are 20, 22, ..., 38.
    let mut report = StressReport::default();
    for i in 0..10 {
        let step = i as u64;
        report.add_frame(FrameProfile {
            cycle: i,
            duration_ms: 20 + step * 2,
            memory_bytes: 1024,
            tests_passed: 1,
            tests_failed: 0,
            input_seed: step,
            input_size: 64,
        });
    }

    // The two threshold sets differ only in the frame-time limit.
    let thresholds_with_frame_limit = |max_frame_time_ms| PerformanceThresholds {
        max_frame_time_ms,
        max_memory_bytes: 64 * 1024 * 1024,
        max_timing_variance: 0.5,
        max_failure_rate: 0.01,
    };

    // Limit above the slowest frame (38): no violations expected.
    let lenient = thresholds_with_frame_limit(50);
    let outcome = verify_performance(&report, &lenient);
    assert!(outcome.passed, "Should pass: {:?}", outcome.violations);
    assert_eq!(outcome.max_frame_ms, 38);
    assert!(outcome.violations.is_empty());

    // Limit below the slowest frame: at least one violation expected.
    let strict = thresholds_with_frame_limit(30);
    let strict_outcome = verify_performance(&report, &strict);
    assert!(!strict_outcome.passed, "Should fail due to max frame time");
    assert!(!strict_outcome.violations.is_empty());
}
/// Renders a `TuiState` fed from a twenty-frame report and checks that the
/// resulting text contains the expected labels.
#[test]
fn test_imp_800d_tui_output() {
    use trueno_gpu::testing::{
        render_to_string, FrameProfile, PerformanceResult, StressReport, TuiState,
    };

    let mut state = TuiState::new(100);

    // Frame durations cycle through 30, 33, 36, 39, 42.
    let mut report = StressReport::default();
    for i in 0..20 {
        let duration_ms = 30 + (i % 5) as u64 * 3;
        report.add_frame(FrameProfile {
            cycle: i,
            duration_ms,
            memory_bytes: 1024 * 1024,
            tests_passed: 5,
            tests_failed: 0,
            input_seed: i as u64,
            input_size: 128,
        });
    }
    state.update_from_report(&report);

    // Hand-written result passed to the renderer.
    let perf = PerformanceResult {
        passed: true,
        max_frame_ms: 42,
        mean_frame_ms: 36.0,
        variance: 0.1,
        pass_rate: 1.0,
        violations: vec![],
    };

    let rendered = render_to_string(&state, &report, &perf);

    // (text that must appear, failure message when it does not).
    let expectations = [
        ("Stress Test Monitor", "Missing header"),
        ("Cycle:", "Missing cycle info"),
        ("FPS:", "Missing FPS"),
        ("PASS", "Missing status"),
        ("Mean:", "Missing mean"),
    ];
    for (needle, message) in expectations {
        assert!(rendered.contains(needle), "{}", message);
    }
}
/// Checks that two `StressRng` instances and two `StressTestRunner` instances
/// created from the same seed produce identical output.
#[test]
fn test_imp_800d_deterministic_output() {
    use trueno_gpu::testing::{StressConfig, StressRng, StressTestRunner};

    let seed = 12345u64;

    // Two generators from one seed must yield the same 100 values.
    let mut rng_a = StressRng::new(seed);
    let mut rng_b = StressRng::new(seed);
    let draw_100 = |rng: &mut StressRng| -> Vec<u32> { (0..100).map(|_| rng.next_u32()).collect() };
    let values_a = draw_100(&mut rng_a);
    let values_b = draw_100(&mut rng_b);
    assert_eq!(values_a, values_b, "Same seed must produce identical sequences");

    // Two runners sharing one config must generate the same inputs.
    let config = StressConfig {
        cycles: 5,
        seed,
        ..StressConfig::default()
    };
    let mut runner_a = StressTestRunner::new(config.clone());
    let mut runner_b = StressTestRunner::new(config);
    for _ in 0..5 {
        let generated_a = runner_a.generate_input();
        let generated_b = runner_b.generate_input();
        assert_eq!(generated_a.0, generated_b.0, "Seeds must match");
        assert_eq!(
            generated_a.1, generated_b.1,
            "Inputs must match for deterministic testing"
        );
    }
}
/// Runs the stress runner with a closure that generates softmax PTX for each
/// input, then verifies the report against the configured thresholds.
///
/// Returns early without asserting anything when `has_cuda()` is false.
#[test]
#[serial]
fn test_imp_800d_stress_runner_gpu() {
    use trueno_gpu::testing::{
        verify_performance, PerformanceThresholds, StressConfig, StressTestRunner,
    };

    if !has_cuda() {
        return;
    }

    // The context is bound to a name so it lives until the end of the test.
    let _context = CudaContext::new(0).expect("test");
    let kernels = CudaKernels::new();

    let thresholds = PerformanceThresholds {
        max_frame_time_ms: 100,
        max_memory_bytes: 64 * 1024 * 1024,
        max_timing_variance: 0.5,
        max_failure_rate: 0.01,
    };
    let config = StressConfig {
        cycles: 20,
        interval_ms: 0,
        seed: 42,
        min_input_size: 128,
        max_input_size: 512,
        thresholds,
    };

    let mut runner = StressTestRunner::new(config.clone());
    let report = runner.run_all(|input| {
        // The softmax dimension follows the generated input length.
        let dim = input.len() as u32;
        let _ptx = kernels.generate_ptx(&KernelType::Softmax { dim });
        // Every cycle returns the pair (1, 0) to the runner.
        (1, 0)
    });

    let outcome = verify_performance(report, &config.thresholds);
    assert!(
        outcome.passed,
        "GPU stress test failed: {:?}",
        outcome.violations
    );
}
/// Generates PTX for a tiled GEMM kernel and checks that the text contains
/// a version header, the kernel name and a shared-memory declaration.
#[test]
fn test_imp_900a_optimized_gemm_kernel() {
    let kernels = CudaKernels::new();
    let ptx = kernels.generate_ptx(&KernelType::GemmTiled {
        m: 32,
        n: 4096,
        k: 4096,
        tile_size: 32,
    });

    // (text that must appear in the PTX, failure message when it does not).
    let expectations = [
        (".version", "IMP-900a: PTX version header"),
        ("gemm", "IMP-900a: Kernel function name"),
        (".shared", "IMP-900a: Shared memory for tiling"),
    ];
    for (needle, message) in expectations {
        assert!(ptx.contains(needle), "{}", message);
    }
}
/// Computes the arithmetic intensity (operations per byte moved) of a
/// 32x4096x4096 GEMM, prints the figures, and asserts the ratio exceeds 10.
#[test]
fn test_imp_900a_gemm_performance_characteristics() {
    // Explicit 64-bit widths: with inferred `i32`, `2 * m * n * k` is already
    // 2^30 for these dimensions, so doubling any of them would overflow.
    // NOTE(review): 4 bytes per element presumably means f32 — confirm.
    const BYTES_PER_ELEMENT: u64 = 4;
    let tile_size: u32 = 32;
    let (m, n, k): (u64, u64, u64) = (32, 4096, 4096);

    // One multiply and one add per (m, n, k) triple.
    let flops = 2 * m * n * k;

    // Bytes for the two inputs (m*k and k*n elements) and the m*n output.
    let input_a = m * k * BYTES_PER_ELEMENT;
    let input_b = k * n * BYTES_PER_ELEMENT;
    let output_c = m * n * BYTES_PER_ELEMENT;
    let total_memory = input_a + input_b + output_c;

    let arithmetic_intensity = flops as f64 / total_memory as f64;

    println!("IMP-900a: GEMM Performance Characteristics");
    println!(" Dimensions: {}x{}x{}", m, n, k);
    println!(" Tile size: {}", tile_size);
    println!(" FLOPS: {:.2} GFLOPS", flops as f64 / 1e9);
    println!(" Memory: {:.2} MB", total_memory as f64 / 1e6);
    println!(
        " Arithmetic Intensity: {:.2} FLOPS/byte",
        arithmetic_intensity
    );

    assert!(
        arithmetic_intensity > 10.0,
        "IMP-900a: GEMM should be compute-bound (>10 FLOPS/byte)"
    );
}
/// Checks that a `KernelType::QuantizedGemm` kernel is named
/// `q4k_gemm_fused` and that the generated PTX contains that name.
#[test]
fn test_imp_900b_kernel_fusion_infrastructure() {
    // The same name is expected from `kernel_name` and inside the PTX text.
    let expected_name = "q4k_gemm_fused";

    let kernels = CudaKernels::new();
    let fused_kernel = KernelType::QuantizedGemm {
        m: 1,
        n: 4096,
        k: 4096,
    };

    let reported_name = kernels.kernel_name(&fused_kernel);
    assert_eq!(reported_name, expected_name, "IMP-900b: Fused kernel name");

    let ptx = kernels.generate_ptx(&fused_kernel);
    assert!(ptx.contains(expected_name), "IMP-900b: Fused kernel in PTX");
}
/// Prints a fixed table of fused kernel names with their descriptions and
/// asserts that the table has three entries.
#[test]
fn test_imp_900b_kernel_fusion_types() {
    // (kernel name, description of what it fuses).
    let fused_kernels = [
        ("q4k_gemm_fused", "Q4_K dequantize + GEMM"),
        ("attention_softmax_fused", "QK matmul + softmax"),
        ("gelu_add_fused", "GELU activation + residual add"),
    ];

    fused_kernels
        .iter()
        .for_each(|(name, description)| println!("IMP-900b: {} - {}", name, description));

    assert_eq!(fused_kernels.len(), 3, "IMP-900b: 3 fused kernel types");
}
/// Compares a `seq_len * seq_len * 4` byte count against a
/// `2 * block_size * head_dim * 4` byte count, prints both, and asserts the
/// first is more than 100 times the second.
#[test]
fn test_imp_900c_flash_attention_config() {
    let seq_len = 1024;
    let head_dim = 64;
    let n_heads = 32;
    let block_size = 64;

    // Byte counts for the two approaches being compared.
    let standard_memory = seq_len * seq_len * 4;
    let flash_memory = 2 * block_size * head_dim * 4;

    // Converted once to floating point for the ratio and the printed figures.
    let standard_bytes = standard_memory as f64;
    let flash_bytes = flash_memory as f64;
    let memory_reduction = standard_bytes / flash_bytes;

    println!("IMP-900c: FlashAttention Memory Analysis");
    println!(" Sequence length: {}", seq_len);
    println!(" Head dimension: {}", head_dim);
    println!(" Num heads: {}", n_heads);
    println!(" Standard memory: {:.2} MB", standard_bytes / 1e6);
    println!(" FlashAttention memory: {:.2} KB", flash_bytes / 1e3);
    println!(" Memory reduction: {:.0}x", memory_reduction);

    assert!(
        memory_reduction > 100.0,
        "IMP-900c: FlashAttention should reduce memory >100x at seq_len=1024"
    );
}