1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
impl CudaExecutor {
    /// Phases 3–5 of the per-layer decode pipeline: incremental attention,
    /// output projection, and the first residual connection.
    ///
    /// Buffer flow:
    /// - `q_buf`/`k_buf`/`v_buf` → (attention) → `attn_out_buf`
    /// - `attn_out_buf` → (output projection) → `hidden_buf1` (reused; its
    ///   previous normed contents are no longer needed)
    /// - `input` + `hidden_buf1` → (residual add) → `input_staging`
    ///
    /// `skip_debug` suppresses the early-layer debug readbacks (needed while a
    /// CUDA graph is being captured); `profiling` toggles per-brick timers.
    ///
    /// # Errors
    /// Propagates any `GpuError` raised by the attention, GEMV, residual,
    /// stream-sync, or event operations.
    #[allow(clippy::too_many_arguments, clippy::too_many_lines)]
    fn workspace_attention_residual_phase(
        &mut self,
        input: &GpuBuffer<f32>,
        hidden_buf1: &GpuBuffer<f32>,
        q_buf: &GpuBuffer<f32>,
        k_buf: &GpuBuffer<f32>,
        v_buf: &GpuBuffer<f32>,
        attn_out_buf: &GpuBuffer<f32>,
        input_staging: &GpuBuffer<f32>,
        layer_idx: usize,
        layer_weights: &ValidatedLayerWeights,
        hidden_dim: u32,
        q_dim: u32,
        skip_debug: bool,
        profiling: bool,
    ) -> Result<(), GpuError> {
        // Phase 3 — PAR-051: attention writes into a pre-allocated workspace
        // buffer, eliminating 28 GPU allocations per token.
        //
        // GH-559 DIAGNOSTIC: the plain (non-capture) attention path is forced
        // here because `incremental_attention_into_for_capture` may produce
        // wrong results on sm_121 (Blackwell). PAR-054-FIX originally switched
        // to the capture-safe variant during graph capture to skip debug sync.
        let attn_timer = profiling
            .then(|| self.start_brick_id(trueno::BrickId::AttentionScore))
            .flatten();
        let _seq_len =
            self.incremental_attention_into(layer_idx, q_buf, k_buf, v_buf, attn_out_buf)?;
        if profiling {
            self.stop_brick_id(attn_timer, 1);
        }

        // PAR-058-DEBUG: sanity-check attention output on the first few layers
        // (skipped during graph capture). Attention ran on compute_stream, so
        // drain that stream before reading the buffer back.
        if !skip_debug && layer_idx < 4 {
            self.compute_stream.synchronize()?;
            self.debug_check_buf(attn_out_buf, "Attn", layer_idx)?;
        }

        // GH-559-PERF: cross-stream ordering via an event rather than a full
        // `compute_stream.synchronize()`. Attention ran on compute_stream while
        // the output projection below reads attn_out_buf on self.stream, so we
        // record an event on compute_stream and have self.stream wait on it.
        // Correctness is preserved (projection waits for attention) without
        // blocking the CPU or stalling other GPU work — saves ~0.5-1.0ms/token
        // across 28 layers.
        //
        // Five-Whys (F-PARITY-02): realizr 1.59x slower → serving overhead →
        // full GPU sync 28x per token in the attention phase →
        // cuStreamSynchronize blocks CPU+GPU → replaced with cuStreamWaitEvent
        // (non-blocking GPU-side ordering). The event is created lazily once
        // and reused on every subsequent call.
        let evt = self
            .attention_event
            .get_or_insert_with(|| {
                CudaEvent::new().expect("GH-559-PERF: attention event creation")
            });
        self.compute_stream.record_event(evt)?;
        self.stream.wait_event(evt)?;

        // PMAT-027: the Q8 activation cache was built for QKV's input
        // (hidden_buf1); the next GEMV reads attn_out_buf instead, so the
        // cached quantization is stale.
        self.q8_activation_valid = false;

        // Phase 4 — output projection: attn_out_buf -> hidden_buf1 (buffer
        // reuse; the normed activations it held are no longer needed).
        let oproj_timer = profiling
            .then(|| self.start_brick_id(trueno::BrickId::OutputProjection))
            .flatten();
        self.gemv_dispatch(
            layer_weights.attn_output_qtype,
            layer_weights.attn_output_ptr,
            attn_out_buf,
            hidden_buf1,
            hidden_dim,
            q_dim,
        )?;
        if profiling {
            self.stop_brick_id(oproj_timer, 1);
        }

        // PAR-058-DEBUG: sanity-check the projection on the first few layers
        // (skipped during graph capture).
        if !skip_debug && layer_idx < 4 {
            self.debug_check_buf(hidden_buf1, "Output proj", layer_idx)?;
        }

        // Phase 5 — first residual: input + projected -> input_staging.
        // PAR-044 FIX: the destination is input_staging rather than hidden_buf2
        // to avoid a read/write conflict when `input` IS hidden_buf2 (every
        // layer after the first). PAR-075: cannot be fused with RmsNorm2
        // because input_staging is needed again for the second residual.
        let res1_timer = profiling
            .then(|| self.start_brick_timer("Residual1"))
            .flatten();
        self.residual_add_into(input, hidden_buf1, input_staging, hidden_dim)?;
        if profiling {
            self.stop_brick_timer(res1_timer, 1);
        }

        // PAR-058-DEBUG: sanity-check residual1 on the first few layers
        // (skipped during graph capture).
        if !skip_debug && layer_idx < 4 {
            self.debug_check_buf(input_staging, "Residual1", layer_idx)?;
        }
        Ok(())
    }
}