/// R006: a fused FFN forward pass with hidden size 4 and intermediate size 8
/// must return four values, none of which is NaN or infinite.
#[test]
#[cfg(feature = "cuda")]
fn r006_fused_ffn_real_swiglu() {
    let ffn = FusedFfnBrick::new(4, 8);

    // Constant activations and weights: only the well-formedness of the
    // result is checked, not specific values.
    let activations = vec![1.0f32; 4];
    let gate_weights = vec![0.1f32; 8 * 4];
    let up_weights = vec![0.2f32; 8 * 4];
    let down_weights = vec![0.1f32; 4 * 8];

    let output = ffn
        .forward(&activations, &gate_weights, &up_weights, &down_weights)
        .expect("operation failed");

    assert_eq!(output.len(), 4);
    assert!(!output.iter().any(|v| v.is_nan()), "No NaNs");
    assert!(!output.iter().any(|v| !v.is_finite()), "All finite");
}
/// R007: on a 1x1 problem with input and all weights equal to 1.0, the fused
/// FFN output must land within 0.01 of 0.731.
#[test]
#[cfg(feature = "cuda")]
fn r007_fused_ffn_swiglu_activation() {
    let ffn = FusedFfnBrick::new(1, 1);

    // Input and all three projections are the single value 1.0, so one buffer
    // can be borrowed for every argument.
    let ones = vec![1.0f32];
    let output = ffn
        .forward(&ones, &ones, &ones, &ones)
        .expect("operation failed");

    let expected = 0.731;
    let deviation = (output[0] - expected).abs();
    assert!(
        deviation < 0.01,
        "SwiGLU output {} should be ~{}",
        output[0],
        expected
    );
}
/// R008: timed activation quantization of 1024 values returns 1024 quantized
/// elements and reports positive latency and throughput figures.
#[test]
fn r008_activation_quant_timed() {
    let quantizer = ActivationQuantBrick::new(1024);

    // Linear ramp 0/1024, 1/1024, ..., 1023/1024.
    let ramp: Vec<f32> = (0..1024).map(|step| step as f32 / 1024.0).collect();

    let timed = quantizer.execute_timed(&ramp).expect("operation failed");

    assert_eq!(timed.output.0.len(), 1024);
    assert!(timed.us_per_token > 0.0);
    assert!(timed.tokens_per_sec > 0.0);

    println!(
        "ActivationQuant: {:.2}µs/tok, {:.0} tok/s",
        timed.us_per_token, timed.tokens_per_sec
    );
}
/// R009: a timed FlashAttention forward pass over a 128-entry sequence yields
/// an output of 8 * 64 elements and a positive per-token latency.
#[test]
fn r009_flash_attention_timed() {
    let seq_len = 128;
    // Key and value buffers share the same element count.
    let kv_elems = seq_len * 2 * 64;

    // NOTE(review): arguments look like (heads, kv heads, head dim) — confirm
    // against the constructor signature.
    let attention = FlashAttentionBrick::new(8, 2, 64);

    let query = vec![0.1f32; 8 * 64];
    let keys = vec![0.1f32; kv_elems];
    let values = vec![0.1f32; kv_elems];

    let timed = attention
        .forward_timed(&query, &keys, &values, seq_len)
        .expect("operation failed");

    assert_eq!(timed.output.len(), 8 * 64);
    assert!(timed.us_per_token > 0.0);

    println!(
        "FlashAttention (seq={}): {:.2}µs/tok, {:.0} tok/s",
        seq_len, timed.us_per_token, timed.tokens_per_sec
    );
}
/// R010: a timed fused FFN forward pass (hidden 64, intermediate 256) returns
/// `hidden` output values and a positive per-token latency.
#[test]
#[cfg(feature = "cuda")]
fn r010_fused_ffn_timed() {
    let (hidden, intermediate) = (64, 256);
    // All three projection matrices hold the same number of weights.
    let weight_count = intermediate * hidden;

    let ffn = FusedFfnBrick::new(hidden, intermediate);

    let activations = vec![0.1f32; hidden];
    let gate_weights = vec![0.01f32; weight_count];
    let up_weights = vec![0.01f32; weight_count];
    let down_weights = vec![0.01f32; weight_count];

    let timed = ffn
        .forward_timed(&activations, &gate_weights, &up_weights, &down_weights)
        .expect("operation failed");

    assert_eq!(timed.output.len(), hidden);
    assert!(timed.us_per_token > 0.0);

    println!(
        "FusedFfn ({}x{}): {:.2}µs/tok, {:.0} tok/s",
        hidden, intermediate, timed.us_per_token, timed.tokens_per_sec
    );
}
/// F009: compile-time check that each brick coerces to a `ComputeBrick` trait
/// object with the expected `Output` type. There are no runtime assertions;
/// the test passes if it compiles and the constructors do not panic.
#[cfg(feature = "cuda")]
#[test]
fn f009_brick_composition_typesafe() {
    let quant = ActivationQuantBrick::new(64);
    let attn = FlashAttentionBrick::new(4, 2, 8);
    let ffn = FusedFfnBrick::new(64, 256);

    let _quant_obj: &dyn ComputeBrick<Output = Vec<u8>> = &quant;
    let _attn_obj: &dyn ComputeBrick<Output = Vec<f32>> = &attn;
    let _ffn_obj: &dyn ComputeBrick<Output = Vec<f32>> = &ffn;
}
/// F016: RMSNorm with unit weights must produce an output whose root mean
/// square is within 0.1 of 1.0.
///
/// This test was previously `#[ignore]`d as flaky because a budget assertion
/// depended on hardware timing. The timing budget has nothing to do with the
/// numerical property checked here, so the brick is now built with the same
/// generous 1000µs latency budget that `f048_rmsnorm_epsilon` uses and the
/// test is enabled again.
#[test]
fn f016_rmsnorm_normalizes() {
    let weights = vec![1.0f32; 64];
    // NOTE(review): assumes `run` is where the budget is enforced, as in
    // f048_rmsnorm_epsilon — confirm.
    let brick = RmsNormBrick::new(weights, 1e-5).with_budget(TokenBudget::from_latency(1000.0));
    let input = vec![2.0f32; 64];

    let result = brick.run(&input).expect("operation failed");
    let output = result.output;

    // Root mean square of the normalized output.
    let mean_square = output.iter().map(|x| x * x).sum::<f32>() / output.len() as f32;
    let rms: f32 = mean_square.sqrt();

    assert!(
        (rms - 1.0).abs() < 0.1,
        "RMSNorm output RMS {} should be ~1.0",
        rms
    );
}
/// F020: two fused FFN forward passes over identical inputs must return
/// exactly equal outputs.
#[test]
#[cfg(feature = "cuda")]
fn f020_brick_determinism() {
    let ffn = FusedFfnBrick::new(4, 8);
    let input = vec![1.0f32; 4];
    let (gate, up, down) = (vec![0.1f32; 32], vec![0.2f32; 32], vec![0.1f32; 32]);

    // One closure guarantees both runs use the very same arguments.
    let run_once = || {
        ffn.forward(&input, &gate, &up, &down)
            .expect("operation failed")
    };

    let first = run_once();
    let second = run_once();

    assert_eq!(first, second, "Same input must produce same output");
}
/// F023: the budget reported by a 1024-weight RmsNorm brick must be below
/// 10µs per token.
#[test]
fn f023_rmsnorm_budget_target() {
    let norm = RmsNormBrick::new(vec![1.0; 1024], 1e-5);
    let budget_us = norm.budget().us_per_token;
    assert!(budget_us < 10.0, "RmsNorm budget should be < 10µs");
}
/// F024: the budget reported by `AttentionBrick::new(32, 8, 128)` must be
/// below 50µs per token.
#[test]
fn f024_attention_brick_budget() {
    let attention = AttentionBrick::new(32, 8, 128);
    let budget_us = attention.budget().us_per_token;
    assert!(budget_us < 50.0, "Attention budget should be < 50µs");
}
/// F028: the budget reported by `FfnBrick::new(1536, 8960)` must be below
/// 100µs per token.
#[test]
fn f028_ffn_budget_target() {
    let ffn = FfnBrick::new(1536, 8960);
    let budget_us = ffn.budget().us_per_token;
    assert!(budget_us < 100.0, "FFN budget should be < 100µs");
}
/// F029: the budget reported by `FusedFfnBrick::new(1536, 8960)` must be below
/// 50µs per token.
#[test]
#[cfg(feature = "cuda")]
fn f029_fused_ffn_budget() {
    let fused = FusedFfnBrick::new(1536, 8960);
    let budget_us = fused.budget().us_per_token;
    assert!(
        budget_us < 50.0,
        "FusedFFN budget should be < 50µs (2x improvement)"
    );
}
/// F030: a throughput target of 976 tok/s must convert to a per-token budget
/// of roughly 1024µs.
///
/// The check is two-sided. An upper bound alone would also accept a zero or
/// negative budget, which contradicts the "~1024µs/tok" claim in the message.
#[test]
fn f030_throughput_target() {
    let target = TokenBudget::from_throughput(976.0);
    let us_per_token = target.us_per_token;
    assert!(
        us_per_token > 1000.0 && us_per_token < 1100.0,
        "976 tok/s should be ~1024µs/tok (got {})",
        us_per_token
    );
}
/// F041: two fused FFN forward passes over identical inputs must agree
/// element-wise to within 1e-6.
///
/// The output lengths are compared first: `zip` stops at the shorter
/// sequence, so without that check a length mismatch (or two empty outputs)
/// would pass without comparing anything.
#[test]
#[cfg(feature = "cuda")]
fn f041_cpu_consistency() {
    let brick = FusedFfnBrick::new(4, 8);
    let input = vec![1.0f32; 4];
    let gate = vec![0.1f32; 32];
    let up = vec![0.2f32; 32];
    let down = vec![0.1f32; 32];

    let out1 = brick
        .forward(&input, &gate, &up, &down)
        .expect("operation failed");
    let out2 = brick
        .forward(&input, &gate, &up, &down)
        .expect("operation failed");

    assert_eq!(
        out1.len(),
        out2.len(),
        "Both runs must produce outputs of the same length"
    );
    assert!(!out1.is_empty(), "Forward pass must produce output");

    for (idx, (a, b)) in out1.iter().zip(out2.iter()).enumerate() {
        assert!(
            (a - b).abs() < 1e-6,
            "CPU output should be bit-identical across runs (index {}: {} vs {})",
            idx,
            a,
            b
        );
    }
}
/// F043: a RoPE brick keeps the head dimension and head count it was built
/// with, has a positive theta, and reports a budget below 5µs per token.
#[test]
fn f043_rope_properties() {
    let rope = RopeBrick::new(64, 8, 10000.0, 0);

    assert_eq!(rope.head_dim, 64, "Head dim should be 64");
    assert_eq!(rope.num_heads, 8, "Num heads should be 8");
    assert!(rope.theta > 0.0, "Theta should be positive");

    let budget_us = rope.budget().us_per_token;
    assert!(budget_us < 5.0, "RoPE budget should be < 5µs");
}
/// F044: attention over inputs with magnitudes of 100 must still produce
/// outputs free of NaN and infinity.
#[test]
fn f044_softmax_stability() {
    let attention = FlashAttentionBrick::new(1, 1, 4);

    let query = vec![100.0f32, 0.0, 0.0, 0.0];
    // Twelve values for a sequence length of 3.
    // NOTE(review): presumably three keys of dimension 4, one per row — confirm
    // the expected layout.
    #[rustfmt::skip]
    let keys = vec![
         100.0, 0.0, 0.0, 0.0,
           0.0, 0.0, 0.0, 0.0,
        -100.0, 0.0, 0.0, 0.0,
    ];
    let values = vec![1.0f32; 12];

    let output = attention
        .forward(&query, &keys, &values, 3)
        .expect("operation failed");

    assert!(!output.iter().any(|v| v.is_nan()), "No NaN in output");
    assert!(
        !output.iter().any(|v| !v.is_finite()),
        "All outputs finite"
    );
}
/// F047: on a 1x1 problem with input and all weights equal to 1.0, the fused
/// FFN output must be within 0.01 of `1 / (1 + e^-1)`.
#[test]
#[cfg(feature = "cuda")]
fn f047_swiglu_correctness() {
    let ffn = FusedFfnBrick::new(1, 1);

    // Input and every projection weight are the single value 1.0.
    let ones = vec![1.0f32];
    let output = ffn
        .forward(&ones, &ones, &ones, &ones)
        .expect("operation failed");

    // Logistic sigmoid evaluated at 1.0.
    let expected = 1.0 / (1.0 + (-1.0f32).exp());
    let deviation = (output[0] - expected).abs();
    assert!(
        deviation < 0.01,
        "SwiGLU output {} should be ~{}",
        output[0],
        expected
    );
}
/// F048: RMSNorm over near-zero inputs (1e-10) with epsilon 1e-5 must produce
/// outputs free of NaN and infinity.
#[test]
fn f048_rmsnorm_epsilon() {
    // NOTE(review): the 1000µs latency budget is presumably there so `run`
    // does not fail on timing — confirm.
    let relaxed_budget = TokenBudget::from_latency(1000.0);
    let norm = RmsNormBrick::new(vec![1.0; 4], 1e-5).with_budget(relaxed_budget);

    let tiny_input = vec![1e-10f32; 4];
    let output = norm.run(&tiny_input).expect("operation failed").output;

    assert!(
        !output.iter().any(|v| v.is_nan()),
        "No NaN with small input"
    );
    assert!(
        !output.iter().any(|v| !v.is_finite()),
        "All outputs finite with small input"
    );
}
/// F068: a freshly built FlashAttention brick has a non-zero tile size, and
/// that tile size is 128.
#[test]
fn f068_shared_memory_ready() {
    let attention = FlashAttentionBrick::new(8, 2, 64);
    let tile = attention.tile_size;

    assert!(tile > 0, "Tile size should be set for tiling");
    assert_eq!(tile, 128, "Default tile size for L2 cache fit");
}
/// F069: the `k` field of `CoalescedDp4aBrick::new(256, 4)` must be a multiple
/// of 256.
#[test]
#[cfg(feature = "cuda")]
fn f069_warp_infrastructure_ready() {
    let dp4a = CoalescedDp4aBrick::new(256, 4);
    let k_aligned = dp4a.k.is_multiple_of(256);
    assert!(k_aligned, "K should be warp-aligned (256)");
}
/// F071: `CudaGraphBrick::new(28, 1536)` must expose a positive `num_layers`.
#[test]
fn f071_launch_overhead_tracking() {
    let graph = CudaGraphBrick::new(28, 1536);
    let layers = graph.num_layers;
    assert!(layers > 0, "Should track layer count");
}
/// F072: a 4096-element activation quantization brick must report
/// `4096 * 3` bytes saved.
#[test]
fn f072_memory_pool_ready() {
    let quantizer = ActivationQuantBrick::new(4096);
    let bytes_saved = quantizer.bytes_saved();

    assert!(bytes_saved > 0, "Should track memory savings");
    assert_eq!(bytes_saved, 4096 * 3, "f32→i8 saves 3 bytes/element");
}
/// F074: against a 100µs budget, `gap_factor` must be below 1.0 for an 80µs
/// measurement and above 1.0 for a 120µs measurement.
#[test]
fn f074_budget_gap_factor() {
    let budget = TokenBudget::from_latency(100.0);

    let under_budget_gap = budget.gap_factor(80.0);
    assert!(under_budget_gap < 1.0, "Under budget = gap < 1.0");

    let over_budget_gap = budget.gap_factor(120.0);
    assert!(over_budget_gap > 1.0, "Over budget = gap > 1.0");
}
/// F079: quantizing 64 values with a brick built for 32 must fail with
/// `BrickError::InvalidInput`, and the message must mention both the actual
/// size (64) and the expected size (32).
///
/// The error variant is matched exhaustively. The earlier `if let` form
/// skipped the message assertions whenever a different variant came back, so
/// a wrong error kind passed unnoticed.
#[test]
fn f079_error_propagation() {
    let brick = ActivationQuantBrick::new(32);
    let wrong_input = vec![1.0f32; 64];

    match brick.quantize(&wrong_input) {
        Err(BrickError::InvalidInput(msg)) => {
            assert!(msg.contains("64"), "Error should mention actual size");
            assert!(msg.contains("32"), "Error should mention expected size");
        }
        Err(_) => panic!("Wrong input size should yield BrickError::InvalidInput"),
        Ok(_) => panic!("Wrong input size should error"),
    }
}
/// F080: bricks built with zero-sized dimensions must return an error from
/// `execute` rather than succeeding.
#[test]
// NOTE(review): presumably needed because the `execute` methods called below
// are deprecated — confirm.
#[allow(deprecated)]
fn f080_edge_cases() {
    let zero_dim_quant = ActivationQuantBrick::new(0);
    let quant_outcome = zero_dim_quant.execute();
    assert!(quant_outcome.is_err(), "Zero dim should error");

    let zero_dim_flash = FlashAttentionBrick::new(0, 0, 0);
    let flash_outcome = zero_dim_flash.execute(10);
    assert!(flash_outcome.is_err(), "Zero heads/dim should error");
}
/// F082: the default benchmark configuration must use at least 100 samples
/// and at least 10 warmup iterations.
#[test]
fn f082_iteration_count() {
    let defaults = BenchmarkConfig::default();

    let enough_samples = defaults.samples >= 100;
    assert!(enough_samples, "Need >= 100 samples for valid stats");

    let enough_warmup = defaults.warmup >= 10;
    assert!(enough_warmup, "Need >= 10 warmup iterations");
}