/// F056: a brick built through `with_packed_dp4a` must report the DP4A flag as set.
#[test]
#[cfg(feature = "cuda")]
fn f056_fused_ffn_dp4a() {
    let packed = FusedFfnBrick::with_packed_dp4a(64, 256);
    assert!(packed.use_packed_dp4a, "DP4A should be enabled");

    // The flag on a default-constructed brick is only read here, never asserted on.
    let plain = FusedFfnBrick::new(64, 256);
    let _ = plain.use_packed_dp4a;
}
/// F057: the fused FFN brick must declare both the `shared_q8_quant` and
/// `swiglu_fused` assertions.
#[test]
#[cfg(feature = "cuda")]
fn f057_fused_ffn_assertions() {
    let brick = FusedFfnBrick::new(64, 256);
    let assertions = brick.assertions();

    // (assertion name, failure message) pairs, checked in this order.
    let required = [
        ("shared_q8_quant", "Should have shared_q8_quant assertion"),
        ("swiglu_fused", "Should have swiglu_fused assertion"),
    ];
    for (name, message) in required {
        let present = assertions.iter().any(|a| a.name == name);
        assert!(present, "{}", message);
    }
}
/// F058: the reported bandwidth reduction must lie strictly between 3.5x and 4.0x.
#[test]
fn f058_activation_quant_bandwidth() {
    let reduction = ActivationQuantBrick::new(1024).bandwidth_reduction();

    // Both bounds are exclusive, so a reduction of exactly 4.0 is rejected.
    let within_band = 3.5 < reduction && reduction < 4.0;
    assert!(
        within_band,
        "Bandwidth reduction should be ~4x, got {:.2}x",
        reduction
    );
}
/// F059: a 1024-element brick must report `1024 * 3` bytes saved.
#[test]
fn f059_activation_quant_bytes_saved() {
    let quant = ActivationQuantBrick::new(1024);
    let expected = 1024 * 3;
    let saved = quant.bytes_saved();
    assert_eq!(saved, expected, "Should save 3 bytes per element");
}
/// F060: the estimated error must be within 1e-4 of 0.001 for the default brick
/// and within 1e-4 of 0.0005 for the per-channel brick.
#[test]
fn f060_activation_quant_error() {
    let per_tensor = ActivationQuantBrick::new(1024);
    let per_channel = ActivationQuantBrick::with_per_channel(1024);

    let tensor_deviation = (per_tensor.estimated_error() - 0.001).abs();
    assert!(tensor_deviation < 0.0001, "Per-tensor error should be 0.1%");

    let channel_deviation = (per_channel.estimated_error() - 0.0005).abs();
    assert!(channel_deviation < 0.0001, "Per-channel error should be 0.05%");
}
/// F061: the activation-quant brick must declare both the `symmetric_range` and
/// `error_bound` assertions.
#[test]
fn f061_activation_quant_assertions() {
    let brick = ActivationQuantBrick::new(1024);
    let assertions = brick.assertions();

    // (assertion name, failure message) pairs, checked in this order.
    let required = [
        ("symmetric_range", "Should have symmetric_range assertion"),
        ("error_bound", "Should have error_bound assertion"),
    ];
    for (name, message) in required {
        let present = assertions.iter().any(|a| a.name == name);
        assert!(present, "{}", message);
    }
}
/// F062: checks the brick's name, that its budget is positive, and that
/// `can_run` is true for 1024 elements but false for zero.
#[test]
fn f062_activation_quant_trait() {
    let sized = ActivationQuantBrick::new(1024);
    assert_eq!(sized.name(), "activation_quant");
    assert!(sized.budget().us_per_token > 0.0);
    assert!(sized.can_run());

    // A brick constructed with size 0 must refuse to run.
    let empty = ActivationQuantBrick::new(0);
    assert!(!empty.can_run());
}
/// F063: a freshly constructed CUDA graph brick has the expected name, a positive
/// budget, and is neither captured nor replayable.
#[test]
fn f063_cuda_graph_capture_ready() {
    let graph = CudaGraphBrick::new(28, 1536);

    assert_eq!(graph.name(), "cuda_graph");
    assert!(graph.budget().us_per_token > 0.0);
    assert!(!graph.captured, "Should start not captured");
    assert!(
        !graph.can_replay(),
        "Should not be replayable until captured"
    );
}
/// F064: the graph brick declares a `graph_speedup` assertion, and once marked
/// as captured it reports itself replayable and `replay` returns `Ok`.
#[test]
fn f064_cuda_graph_replay_ready() {
    let mut graph = CudaGraphBrick::new(28, 1536);

    let checks_speedup = graph
        .assertions()
        .iter()
        .any(|a| a.name == "graph_speedup");
    assert!(checks_speedup, "Should verify graph speedup");

    // Flip the captured flag by hand; no real capture is performed in this test.
    graph.set_captured(true);
    assert!(graph.can_replay(), "Should be replayable after capture");
    assert!(graph.replay().is_ok(), "Replay should succeed");
}
/// F065: a `CoalescedDp4aBrick::new(1024, 256)` can run and declares a
/// `bandwidth_efficient` assertion.
#[test]
#[cfg(feature = "cuda")]
fn f065_indirect_kernel_ready() {
    let kernel = CoalescedDp4aBrick::new(1024, 256);
    assert!(kernel.can_run());

    let checks_bandwidth = kernel
        .assertions()
        .iter()
        .any(|a| a.name == "bandwidth_efficient");
    assert!(checks_bandwidth, "Should verify bandwidth efficiency");
}
/// F066: the brick's name mentions `dp4a` and its `k` field is a multiple of 256.
#[test]
#[cfg(feature = "cuda")]
fn f066_dp4a_instruction_ready() {
    let kernel = CoalescedDp4aBrick::new(1024, 256);

    let mentions_dp4a = kernel.name().contains("dp4a");
    assert!(mentions_dp4a, "Name should indicate DP4A usage");

    let k = kernel.k;
    assert!(k.is_multiple_of(256), "K should align for DP4A");
}
/// F067: the brick's `k` field is either below 256 or a multiple of 256.
#[test]
#[cfg(feature = "cuda")]
fn f067_memory_coalescing_ready() {
    let kernel = CoalescedDp4aBrick::new(1024, 256);
    let k = kernel.k;

    let aligned = k < 256 || k.is_multiple_of(256);
    assert!(aligned, "K should align for coalescing");
}
/// F070: the RMS-norm, QKV, attention and FFN bricks all report a positive
/// per-token budget.
#[test]
fn f070_register_usage_ready() {
    let norm_brick = RmsNormBrick::new(vec![1.0; 64], 1e-5);
    let qkv_brick = QkvBrick::new(64, 64, 64, 64);
    let attention_brick = AttentionBrick::new(8, 2, 64);
    let ffn_brick = FfnBrick::new(64, 256);

    assert!(norm_brick.budget().us_per_token > 0.0);
    assert!(qkv_brick.budget().us_per_token > 0.0);
    assert!(attention_brick.budget().us_per_token > 0.0);
    assert!(ffn_brick.budget().us_per_token > 0.0);
}
/// F073: both `BrickError::InvalidInput` and `BrickError::BudgetExceeded` render
/// to a non-empty string through `Display`.
#[test]
fn f073_error_handling_ready() {
    let invalid_input = BrickError::InvalidInput(String::from("test"));
    let over_budget = BrickError::BudgetExceeded {
        limit_us: 10.0,
        actual_us: 20.0,
    };

    // `to_string` goes through the same `Display` impl as `format!("{}")`.
    assert!(!invalid_input.to_string().is_empty());
    assert!(!over_budget.to_string().is_empty());
}
/// F081: a budget built from a throughput of `976.0 * 2.0` tokens/s exposes more
/// than 1900 tokens/s and less than 520 µs per token.
#[test]
fn f081_throughput_comparison_ready() {
    let doubled_target = TokenBudget::from_throughput(976.0 * 2.0);

    assert!(doubled_target.tokens_per_sec > 1900.0);
    assert!(doubled_target.us_per_token < 520.0);
}
/// F085: the default benchmark configuration requests at least 100 samples.
#[test]
fn f085_cv_calculation_ready() {
    let defaults = BenchmarkConfig::default();
    let enough_samples = defaults.samples >= 100;
    assert!(enough_samples, "Need >= 100 samples for valid CV");
}
/// F086: builds a `BenchmarkReport` by hand and checks that its p99 latency is
/// below twice its p50 latency.
#[test]
fn f086_latency_percentile_ready() {
    let report = BenchmarkReport {
        brick_name: String::from("test"),
        mean_us: 50.0,
        std_us: 5.0,
        cv: 10.0,
        p50_us: 50.0,
        p99_us: 58.0,
        tokens_per_sec: 20000.0,
        budget_us: 100.0,
        budget_met: true,
        statistically_valid: true,
    };

    let (median_us, tail_us) = (report.p50_us, report.p99_us);
    assert!(tail_us < median_us * 2.0);
}
/// F087: against a 100.0 latency budget, `gap_factor` is below 1.0 for 80.0 and
/// above 1.0 for 120.0.
#[test]
fn f087_baseline_comparison_ready() {
    let budget = TokenBudget::from_latency(100.0);

    let under = budget.gap_factor(80.0);
    assert!(under < 1.0, "Under budget should have gap < 1.0");

    let over = budget.gap_factor(120.0);
    assert!(over > 1.0, "Over budget should have gap > 1.0");
}
/// F090: the CUDA graph brick's per-token budget is below 100 µs.
#[test]
fn f090_cuda_graph_overhead_ready() {
    let graph = CudaGraphBrick::new(28, 1536);
    let budget_us = graph.budget().us_per_token;
    assert!(budget_us < 100.0, "Graph overhead should be < 100µs");
}
/// F092: activation quantization reports non-zero bytes saved, and the second
/// value returned by `FlashAttentionBrick::memory_bytes(512)` is smaller than
/// the first.
#[test]
fn f092_memory_usage_ready() {
    let quant = ActivationQuantBrick::new(1024);
    assert!(quant.bytes_saved() > 0, "Should track memory savings");

    let attention = FlashAttentionBrick::new(8, 2, 64);
    // The tuple is (naive, flash) memory, going by how the original binds it.
    let (naive_bytes, flash_bytes) = attention.memory_bytes(512);
    assert!(flash_bytes < naive_bytes, "Flash should use less memory");
}
/// F093: constructs and immediately drops three kinds of brick, 100 times over.
/// The test makes no assertions; it passes as long as nothing panics.
#[test]
fn f093_memory_leak_detection_ready() {
    (0..100).for_each(|_| {
        drop(RmsNormBrick::new(vec![1.0; 1024], 1e-5));
        drop(FfnBrick::new(1024, 4096));
        drop(AttentionBrick::new(32, 8, 128));
    });
}
/// F094: executing a zero-sized activation-quant brick returns an error, and if
/// that error is `BrickError::InvalidInput` its message is non-empty.
#[test]
// NOTE(review): presumably `execute` is the deprecated item being exercised — confirm.
#[allow(deprecated)]
fn f094_graceful_degradation_ready() {
    let empty = ActivationQuantBrick::new(0);
    let outcome = empty.execute();
    assert!(outcome.is_err(), "Zero-dim should fail gracefully");

    // Any other error variant passes without its message being checked.
    if let Err(BrickError::InvalidInput(message)) = outcome {
        assert!(!message.is_empty(), "Error should have message");
    }
}
/// R001: quantizing 64 values yields 64 quantized values and 2 scales, all of
/// which are positive.
#[test]
fn r001_activation_quant_real_quantize() {
    let quantizer = ActivationQuantBrick::new(64);
    // Ramp from -3.2 to 3.1 in steps of 0.1.
    let samples: Vec<f32> = (0..64).map(|i| (i as f32 - 32.0) / 10.0).collect();

    let (quantized, scale_factors) = quantizer.quantize(&samples).expect("operation failed");

    assert_eq!(quantized.len(), 64);
    // NOTE(review): 2 scales for 64 inputs suggests one scale per 32 elements — confirm.
    assert_eq!(scale_factors.len(), 2);
    assert!(
        scale_factors.iter().all(|&s| s > 0.0),
        "Scales must be positive"
    );
}
/// R002: a quantize/dequantize round trip over 32 values keeps the measured
/// error below 0.01 and every element within 0.05 of its original.
#[test]
fn r002_activation_quant_roundtrip() {
    let quantizer = ActivationQuantBrick::new(32);
    // Ramp from -1.6 to 1.5 in steps of 0.1.
    let original: Vec<f32> = (0..32).map(|i| (i as f32 - 16.0) * 0.1).collect();

    let (quantized, scale_factors) = quantizer.quantize(&original).expect("operation failed");
    let restored = quantizer
        .dequantize(&quantized, &scale_factors)
        .expect("operation failed");
    let error = quantizer
        .measure_error(&original, &quantized, &scale_factors)
        .expect("operation failed");

    assert!(error < 0.01, "Q8 error {} should be < 1%", error);

    // Element-wise check on top of the aggregate error above.
    for (index, (&before, &after)) in original.iter().zip(restored.iter()).enumerate() {
        let diff = (before - after).abs();
        assert!(diff < 0.05, "Value {} diff {} too large", index, diff);
    }
}
/// R003: a forward pass over constant inputs returns `4 * 8` values, none of
/// them NaN and all of them finite.
#[test]
fn r003_flash_attention_real_forward() {
    // NOTE(review): arguments look like (heads, kv heads, head dim) — confirm against the constructor.
    let attention = FlashAttentionBrick::new(4, 2, 8);
    let seq_len = 4;

    let query = vec![1.0f32; 4 * 8];
    let keys = vec![0.5f32; seq_len * 2 * 8];
    let values = vec![0.25f32; seq_len * 2 * 8];

    let result = attention
        .forward(&query, &keys, &values, seq_len)
        .expect("operation failed");

    assert_eq!(result.len(), 4 * 8);
    assert!(result.iter().all(|&v| !v.is_nan()), "No NaNs");
    assert!(result.iter().all(|&v| v.is_finite()), "All finite");
}
/// R004: with three hand-built keys and values, the first output component must
/// exceed both the second and the third.
#[test]
fn r004_flash_attention_softmax_correct() {
    let attention = FlashAttentionBrick::new(1, 1, 4);
    let seq_len = 3;

    let query = vec![1.0f32, 0.0, 0.0, 0.0];
    // Laid out below as three rows of four — presumably one row per position; confirm.
    // Row 0 equals the query, row 2 is its negation.
    let keys = vec![
        1.0, 0.0, 0.0, 0.0, //
        0.0, 1.0, 0.0, 0.0, //
        -1.0, 0.0, 0.0, 0.0, //
    ];
    let values = vec![
        1.0, 0.0, 0.0, 0.0, //
        0.0, 1.0, 0.0, 0.0, //
        0.0, 0.0, 1.0, 0.0, //
    ];

    let result = attention
        .forward(&query, &keys, &values, seq_len)
        .expect("operation failed");

    assert!(result[0] > result[1], "V0 weight should be highest");
    assert!(result[0] > result[2], "V0 weight should be highest");
}
/// R005: a forward pass through `CoalescedDp4aBrick::new(256, 4)` with constant
/// inputs returns 4 values, none of them NaN.
#[test]
#[cfg(feature = "cuda")]
fn r005_coalesced_dp4a_real_gemv() {
    let kernel = CoalescedDp4aBrick::new(256, 4);

    let activations = vec![1i8; 256];
    let activation_scale = 1.0 / 127.0;
    // Every byte is 0x88; the buffer holds `4 * 256 / 2` bytes.
    let packed_weights = vec![0x88u8; 4 * 256 / 2];
    let weight_scales = vec![0.1f32; 4];

    let result = kernel
        .forward(&activations, activation_scale, &packed_weights, &weight_scales)
        .expect("operation failed");

    assert_eq!(result.len(), 4);
    assert!(result.iter().all(|&v| !v.is_nan()));
}