// Graph-captured forward pass for single-token decode: collapses ~280 per-kernel launches into a single CUDA graph launch.
impl CudaExecutor {
/// PAR-054: Graph-captured forward pass for decode (M=1)
///
/// Uses CUDA graph capture to reduce kernel launch overhead from ~280 launches
/// to 1 graph launch (~10µs vs ~5.6ms overhead).
///
/// First decode token: captures the kernel sequence into a graph
/// Subsequent tokens: replays the captured graph with updated position
///
/// # Performance
///
/// - Without graphs: ~280 kernel launches × ~20µs = ~5.6ms overhead/token
/// - With graphs: 1 graph launch × ~10µs = ~0.01ms overhead/token
/// - Net effect: ~500x reduction in launch overhead (5.6ms → 0.01ms per token; launch overhead only, not end-to-end speedup)
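///
/// # Example
///
/// A minimal decode-loop sketch (marked `ignore`; setup elided). Assumes an
/// executor already initialized with indexed weights, a workspace, and
/// `lm_head_ptr`; the model dimensions below are illustrative, not prescriptive.
///
/// ```ignore
/// // exec: &mut CudaExecutor, embedding: &[f32] (current token's embedding)
/// let vocab_size = 32_000u32;
/// let mut logits = vec![0.0f32; vocab_size as usize];
/// for pos in 0..16u32 {
///     // Token 0 captures the graph (or falls back to eager); tokens 1.. replay it.
///     exec.forward_all_layers_gpu_to_logits_graphed(
///         embedding,
///         &mut logits,
///         pos,
///         32,     // num_layers (illustrative)
///         4096,   // hidden_dim (illustrative)
///         11008,  // intermediate_dim (illustrative)
///         vocab_size,
///         1e-5,   // epsilon
///     )?;
///     // ...sample the next token from `logits`, fetch its embedding...
/// }
/// ```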
#[allow(clippy::too_many_arguments)]
pub fn forward_all_layers_gpu_to_logits_graphed(
&mut self,
input: &[f32],
logits: &mut [f32],
position: u32,
num_layers: usize,
hidden_dim: u32,
intermediate_dim: u32,
vocab_size: u32,
epsilon: f32,
) -> Result<(), GpuError> {
// C-GDP-001: Profiling requires eager path for per-brick instrumentation.
// CUDA graph replay executes all kernels in one opaque launch, hiding bricks.
// Contract: gpu-decode-profiling-v1 FALSIFY-GDP-001.
// PAR-118: Skip capture if previous attempt failed (error 901 → CUDA corruption).
if self.should_use_eager_decode() || self.graph_capture_failed {
return self.forward_all_layers_gpu_to_logits(
input, logits, position, num_layers, hidden_dim,
intermediate_dim, vocab_size, epsilon,
);
}
// PAR-054: Replay captured graph if available
if self.decode_graph.is_some() && self.decode_token_count > 0 {
if self.decode_token_count <= 3 && verbose() {
eprintln!(
"[PAR-054] Graph replay #{} (pos={})",
self.decode_token_count, position
);
}
return self.forward_graphed_replay(input, logits, position);
}
// First token: attempt graph capture (requires workspace + lm_head)
self.try_first_token_graph_capture(
input, logits, position, num_layers, hidden_dim,
intermediate_dim, vocab_size, epsilon,
)
}
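// The replay path called above (`forward_graphed_replay`) is defined elsewhere
// in this impl. The sketch below shows its expected shape only: the captured
// graph bakes in fixed device addresses, so replay overwrites those buffers in
// place and relaunches. `GraphExec::launch` and the `decode_token_count`
// bookkeeping are assumptions drawn from usage in this file, not a confirmed API.
#[allow(dead_code)]
fn forward_graphed_replay_sketch(
    &mut self,
    input: &[f32],
    logits: &mut [f32],
    position: u32,
) -> Result<(), GpuError> {
    // Refresh the device-side inputs the captured kernels read from.
    if let Some(buf) = self.position_buf.as_mut() {
        buf.copy_from_host(&[position])?;
    }
    if let Some(buf) = self.seq_len_buf.as_mut() {
        buf.copy_from_host(&[position + 1])?; // PAR-061: seq_len = position + 1
    }
    if let Some(buf) = self.graph_input_buf.as_mut() {
        buf.copy_from_host(input)?;
    }
    // One launch replays the entire recorded kernel sequence.
    if let Some(graph) = self.decode_graph.as_ref() {
        graph.launch(&self.stream)?; // hypothetical wrapper over cuGraphLaunch
    }
    self.stream.synchronize()?;
    if let Some(buf) = self.workspace.logits_buf.as_ref() {
        buf.copy_to_host(logits)?;
    }
    self.decode_token_count += 1;
    Ok(())
}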
/// Returns true if the eager (non-graphed) decode path should be used.
/// GH-559-PERF: Default to eager. Graph capture poisons CUDA context on
/// drivers 570.207 and 590.48.01 (CUDA_ERROR_UNKNOWN code 901).
/// Opt-in with CUDA_GRAPH_ENABLE=1.
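/// Example opt-in (binary name illustrative): `CUDA_GRAPH_ENABLE=1 ./decode-bin`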
fn should_use_eager_decode(&self) -> bool {
static GRAPH_ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
let graph_enabled = *GRAPH_ENABLED.get_or_init(|| {
std::env::var("CUDA_GRAPH_ENABLE")
.map(|v| v == "1")
.unwrap_or(false)
});
!graph_enabled || self.profiler.is_enabled()
}
/// First-token graph capture: initialize buffers, attempt capture, fallback on failure.
#[allow(clippy::too_many_arguments)]
fn try_first_token_graph_capture(
&mut self,
input: &[f32],
logits: &mut [f32],
position: u32,
num_layers: usize,
hidden_dim: u32,
intermediate_dim: u32,
vocab_size: u32,
epsilon: f32,
) -> Result<(), GpuError> {
let use_workspace = self.has_workspace()
&& self.has_indexed_weights()
&& self.indexed_layer_weights.len() == num_layers;
if !use_workspace {
eprintln!("[PAR-054] Workspace not ready, using non-graphed path (has_workspace={}, has_indexed={}, layers={})",
self.has_workspace(), self.has_indexed_weights(), self.indexed_layer_weights.len());
return self.forward_all_layers_gpu_to_logits(
input, logits, position, num_layers, hidden_dim,
intermediate_dim, vocab_size, epsilon,
);
}
if self.lm_head_ptr == 0 {
eprintln!("[PAR-054] lm_head_ptr not set, using non-graphed path");
return self.forward_all_layers_gpu_to_logits(
input, logits, position, num_layers, hidden_dim,
intermediate_dim, vocab_size, epsilon,
);
}
self.prepare_capture_buffers(input, position, hidden_dim, vocab_size)?;
self.preload_modules_for_capture(num_layers, hidden_dim, intermediate_dim, vocab_size)?;
// trueno#243: Skip stream capture (code 901 poisons context on driver 570.207).
// Go directly to manual graph construction via cuGraphAddKernelNode.
// Run one eager forward with recording, then build graph from records.
eprintln!("[trueno#243] Manual graph construction (skipping stream capture)...");
self.begin_graph_recording();
self.is_capturing = true;
let eager_result = self.forward_workspace_captured(
num_layers, hidden_dim, intermediate_dim, vocab_size, epsilon,
);
self.is_capturing = false;
if let Err(eager_err) = eager_result {
self.graph_recording = false;
self.graph_capture_failed = true;
eprintln!("[trueno#243] Eager forward during recording failed: {:?}", eager_err);
return self.forward_all_layers_gpu_to_logits(
input, logits, position, num_layers, hidden_dim,
intermediate_dim, vocab_size, epsilon,
);
}
// Eager pass succeeded — now build graph from recorded kernels
match self.end_graph_recording() {
Ok(n) if n > 0 => {
// Manual graph built! First token already computed by eager pass.
// Download logits from workspace buffer.
self.stream.synchronize()?;
if let Some(ref logits_buf) = self.workspace.logits_buf {
logits_buf.copy_to_host(logits)?;
}
Ok(())
},
Ok(_) => {
eprintln!("[trueno#243] No kernels recorded (recording not wired to all ops yet)");
// First token was computed by eager pass, download logits
self.stream.synchronize()?;
if let Some(ref logits_buf) = self.workspace.logits_buf {
logits_buf.copy_to_host(logits)?;
}
// Mark graph as failed so subsequent tokens use eager path
self.graph_capture_failed = true;
Ok(())
},
Err(graph_err) => {
self.graph_capture_failed = true;
eprintln!("[trueno#243] Manual graph build failed: {:?}, using eager", graph_err);
// First token was already computed, just download logits
self.stream.synchronize()?;
if let Some(ref logits_buf) = self.workspace.logits_buf {
logits_buf.copy_to_host(logits)?;
}
Ok(())
},
}
}
/// Initialize GPU buffers required for CUDA graph capture.
fn prepare_capture_buffers(
&mut self,
input: &[f32],
position: u32,
hidden_dim: u32,
vocab_size: u32,
) -> Result<(), GpuError> {
// Position buffer
match self.position_buf {
None => { self.position_buf = Some(GpuBuffer::from_host(&self.context, &[position])?); },
Some(ref mut buf) => { buf.copy_from_host(&[position])?; },
}
// PAR-061: seq_len = position + 1
let seq_len = position + 1;
match self.seq_len_buf {
None => { self.seq_len_buf = Some(GpuBuffer::from_host(&self.context, &[seq_len])?); },
Some(ref mut buf) => { buf.copy_from_host(&[seq_len])?; },
}
// Stable input buffer: the captured graph will reference this buffer's device
// address, so subsequent tokens must overwrite it in place, never reallocate it.
let hidden_size = hidden_dim as usize;
let needs_new = self.graph_input_buf.as_ref().map_or(true, |b| b.len() != hidden_size);
if needs_new {
self.graph_input_buf = Some(GpuBuffer::from_host(&self.context, input)?);
} else {
self.graph_input_buf.as_mut().unwrap().copy_from_host(input)?;
}
// Pre-allocate workspace buffers
if self.workspace.normed_hidden_buf.is_none() {
self.workspace.normed_hidden_buf = Some(GpuBuffer::new(&self.context, hidden_size)?);
}
// PMAT-088: Logits buffer may have been resized by batched decode (M*vocab_size).
// Must reallocate to exact vocab_size for M=1 graph capture, otherwise
// copy_to_host will fail with "Length mismatch" (host=vocab vs device=M*vocab).
let needs_logits = self.workspace.logits_buf.as_ref().map_or(true, |b| {
b.len() != vocab_size as usize
});
if needs_logits {
self.workspace.logits_buf = Some(GpuBuffer::new(&self.context, vocab_size as usize)?);
}
Ok(())
}
}
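// `begin_graph_recording` / `end_graph_recording` are implemented elsewhere.
// The module below is a conceptual sketch of the record-then-build pattern
// they imply: record every kernel launch during one eager pass, then chain
// the records into a graph via the CUDA driver API (cuGraphAddKernelNode, as
// noted in the trueno#243 comment above). `RecordedLaunch`, the field names,
// and the strictly serial dependency chain are assumptions for illustration;
// real code must also keep every kernel argument at a stable device address
// for the lifetime of the instantiated graph.
#[allow(dead_code)]
mod graph_build_sketch {
    use std::ffi::c_void;
    use std::ptr;

    /// Mirrors CUDA_KERNEL_NODE_PARAMS (v1 layout) from cuda.h.
    #[repr(C)]
    pub struct CudaKernelNodeParams {
        pub func: *mut c_void, // CUfunction handle
        pub grid_dim_x: u32,
        pub grid_dim_y: u32,
        pub grid_dim_z: u32,
        pub block_dim_x: u32,
        pub block_dim_y: u32,
        pub block_dim_z: u32,
        pub shared_mem_bytes: u32,
        pub kernel_params: *mut *mut c_void, // array of pointers to each argument
        pub extra: *mut *mut c_void,
    }

    extern "C" {
        // CUDA driver API, v1 symbols (see cuda.h for versioned variants).
        fn cuGraphCreate(ph_graph: *mut *mut c_void, flags: u32) -> i32;
        fn cuGraphAddKernelNode(
            ph_node: *mut *mut c_void,
            graph: *mut c_void,
            deps: *const *mut c_void,
            num_deps: usize,
            params: *const CudaKernelNodeParams,
        ) -> i32;
        fn cuGraphInstantiateWithFlags(
            ph_exec: *mut *mut c_void,
            graph: *mut c_void,
            flags: u64,
        ) -> i32;
    }

    /// One kernel launch captured during the recorded eager pass.
    pub struct RecordedLaunch {
        pub params: CudaKernelNodeParams,
    }

    /// Chain the recorded launches into an executable graph. Each node
    /// depends on its predecessor, preserving eager launch order.
    pub unsafe fn build_graph(records: &[RecordedLaunch]) -> Result<*mut c_void, i32> {
        let mut graph: *mut c_void = ptr::null_mut();
        let rc = cuGraphCreate(&mut graph, 0);
        if rc != 0 {
            return Err(rc);
        }
        let mut prev: *mut c_void = ptr::null_mut();
        for rec in records {
            let deps: &[*mut c_void] =
                if prev.is_null() { &[] } else { std::slice::from_ref(&prev) };
            let mut node: *mut c_void = ptr::null_mut();
            let rc = cuGraphAddKernelNode(&mut node, graph, deps.as_ptr(), deps.len(), &rec.params);
            if rc != 0 {
                return Err(rc);
            }
            prev = node;
        }
        let mut exec: *mut c_void = ptr::null_mut();
        let rc = cuGraphInstantiateWithFlags(&mut exec, graph, 0);
        if rc != 0 {
            return Err(rc);
        }
        Ok(exec) // launch later with cuGraphLaunch(exec, stream)
    }
}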