impl CudaExecutor {
/// Verifies that every RMSNorm gamma needed for a full forward-to-logits
/// pass is present in `rmsnorm_cache`: per-layer attn/ffn norms for all
/// `num_layers` layers plus the final `output_norm.gamma`.
///
/// # Errors
/// Returns `GpuError::InvalidLaunchConfig` naming the first missing entry.
fn validate_rmsnorm_cache_for_logits(
    &self,
    num_layers: usize,
) -> Result<(), GpuError> {
    for layer in 0..num_layers {
        let attn_key = format!("blk.{}.attn_norm.gamma", layer);
        let ffn_key = format!("blk.{}.ffn_norm.gamma", layer);
        if !self.rmsnorm_cache.contains_key(&attn_key) {
            return Err(GpuError::InvalidLaunchConfig(format!(
                "PAR-023: attn_norm not cached for layer {}",
                layer
            )));
        }
        if !self.rmsnorm_cache.contains_key(&ffn_key) {
            return Err(GpuError::InvalidLaunchConfig(format!(
                "PAR-023: ffn_norm not cached for layer {}",
                layer
            )));
        }
    }
    // The final norm is keyed without a layer prefix.
    match self.rmsnorm_cache.contains_key("output_norm.gamma") {
        true => Ok(()),
        false => Err(GpuError::InvalidLaunchConfig(
            "PAR-023: output_norm not cached".to_string(),
        )),
    }
}
/// Uploads `input` to a fresh device buffer, zero-padding its length up to
/// the next multiple of 256 elements; an already-aligned slice is uploaded
/// as-is with no host-side copy.
fn pad_and_upload_input(&self, input: &[f32]) -> Result<GpuBuffer<f32>, GpuError> {
    // Round up to the next 256-element boundary.
    let aligned_len = ((input.len() + 255) / 256) * 256;
    if aligned_len == input.len() {
        // Already aligned: hand the caller's slice straight to the upload.
        return GpuBuffer::from_host(&self.context, input);
    }
    // Stage into a zero-filled, aligned host buffer first.
    let mut staged = vec![0.0f32; aligned_len];
    staged[..input.len()].copy_from_slice(input);
    GpuBuffer::from_host(&self.context, &staged)
}
/// Runs the transformer stack via the preallocated-workspace path.
///
/// Layer 0 reads from `hidden_gpu` (the uploaded embedding); subsequent
/// layers are delegated to `run_remaining_workspace_layers`, which reads
/// the running hidden state from the workspace's `hidden_buf2`.
#[allow(clippy::too_many_arguments)]
fn run_workspace_layers(
    &mut self,
    hidden_gpu: &GpuBuffer<f32>,
    num_layers: usize,
    hidden_dim: u32,
    intermediate_dim: u32,
    epsilon: f32,
    position: u32,
) -> Result<(), GpuError> {
    // Debug dumps are gated by the GPU_DEBUG_ALL_LAYERS env flag (cached).
    let debug_layers = Self::debug_layers_enabled();
    Self::debug_dump_input(debug_layers, hidden_gpu, hidden_dim);
    if num_layers > 0 {
        // Clone the weight handle so the borrow of `indexed_layer_weights`
        // ends before the `&mut self` call below.
        let layer_weights = self.indexed_layer_weights[0].clone();
        self.transformer_layer_workspace(
            hidden_gpu,
            0,
            &layer_weights,
            hidden_dim,
            intermediate_dim,
            epsilon,
            position,
        )?;
    }
    self.debug_dump_layer0_output(debug_layers, hidden_dim);
    self.run_remaining_workspace_layers(
        num_layers,
        hidden_dim,
        intermediate_dim,
        epsilon,
        position,
        debug_layers,
    )
}
/// Returns whether per-layer debug dumps are enabled.
///
/// Reads `GPU_DEBUG_ALL_LAYERS` once per process ("1" enables) and caches
/// the answer in a `OnceLock`.
fn debug_layers_enabled() -> bool {
    static DEBUG_LAYERS: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    *DEBUG_LAYERS
        .get_or_init(|| matches!(std::env::var("GPU_DEBUG_ALL_LAYERS"), Ok(v) if v == "1"))
}
/// Debug aid (GH-559): prints summary stats (sum, RMS, first 5 values) of
/// the embedding input feeding layer 0. Silently does nothing when disabled
/// or when the device readback fails.
fn debug_dump_input(enabled: bool, hidden_gpu: &GpuBuffer<f32>, hidden_dim: u32) {
    if !enabled {
        return;
    }
    // Sample at most one hidden vector's worth of values.
    let sample_len = (hidden_dim as usize).min(hidden_gpu.len());
    let mut sample = vec![0.0f32; sample_len];
    if hidden_gpu.copy_to_host(&mut sample).is_err() {
        return;
    }
    let total: f32 = sample.iter().sum();
    let mean_sq = sample.iter().map(|x| x * x).sum::<f32>() / sample_len as f32;
    eprintln!(
        "[GH-559] Layer 0 INPUT (hidden_gpu embed): sum={:.6}, rms={:.6}, first5={:?}",
        total,
        mean_sq.sqrt(),
        &sample[..5.min(sample_len)]
    );
}
/// Debug aid (GH-559): prints summary stats for layer 0's output, which the
/// workspace path leaves in `workspace.hidden_buf2`, followed by per-256-
/// element sub-block sums to help localize divergence within the vector.
///
/// No-op unless `enabled`, the workspace buffer exists, and readback works.
/// NOTE(review): the "/28" in the log line is a hard-coded layer count for
/// the model under debug, not derived from the actual configuration.
fn debug_dump_layer0_output(&self, enabled: bool, hidden_dim: u32) {
    if !enabled {
        return;
    }
    let Some(buf2) = self.workspace.hidden_buf2.as_ref() else {
        return;
    };
    // Sample at most one hidden vector's worth of values.
    let n = (hidden_dim as usize).min(buf2.len());
    let mut host = vec![0.0f32; n];
    if buf2.copy_to_host(&mut host).is_err() {
        return;
    }
    let sum: f32 = host.iter().sum();
    let rms = (host.iter().map(|x| x * x).sum::<f32>() / n as f32).sqrt();
    eprintln!(
        "[GH-559] Layer 0/28 OUTPUT (hidden_buf2): sum={:.6}, rms={:.6}, first5={:?}",
        sum, rms, &host[..5.min(n)]
    );
    for sb in 0..(n / 256) {
        let idx = sb * 256;
        // First five values of this sub-block (clamped to the sample end).
        let end = (idx + 5).min(n);
        let sb_sum: f32 = host[idx..idx + 256.min(n - idx)].iter().sum();
        eprintln!(
            "[GH-559-GPU] L0 sb{}: idx={}, sum={:.4}, vals={:?}",
            sb, idx, sb_sum, &host[idx..end]
        );
    }
}
/// Runs layers `1..num_layers` in-place over the workspace's `hidden_buf2`
/// (layer 0 is handled by the caller, which seeds that buffer).
///
/// Each iteration wraps `hidden_buf2` in a non-owning `GpuBuffer` alias so
/// it can be passed as the layer input while `self` remains mutably
/// borrowable; the alias is `mem::forget`-ed so the workspace's own Drop
/// remains the sole owner of the device allocation.
///
/// Fix: the forget now happens BEFORE the layer result is propagated with
/// `?`. Previously an error return dropped the alias, freeing device memory
/// still owned by the workspace (double free on the error path).
fn run_remaining_workspace_layers(
    &mut self,
    num_layers: usize,
    hidden_dim: u32,
    intermediate_dim: u32,
    epsilon: f32,
    position: u32,
    debug_layers: bool,
) -> Result<(), GpuError> {
    for layer_idx in 1..num_layers {
        // Clone the weight handle so the borrow of `indexed_layer_weights`
        // ends before the `&mut self` call below.
        let layer_weights = self.indexed_layer_weights[layer_idx].clone();
        let buf2 = self
            .workspace
            .hidden_buf2
            .as_ref()
            .expect("hidden_buf2 must be initialized");
        let buf_ptr = buf2.as_ptr();
        let buf_len = buf2.len();
        // SAFETY: ptr/len come from the live workspace buffer; the alias is
        // forgotten below so the device memory is freed exactly once.
        let input_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(buf_ptr, buf_len) };
        Self::debug_dump_layer_input(
            debug_layers,
            &input_buf,
            buf_len,
            hidden_dim,
            layer_idx,
            num_layers,
        );
        let layer_result = self.transformer_layer_workspace(
            &input_buf,
            layer_idx,
            &layer_weights,
            hidden_dim,
            intermediate_dim,
            epsilon,
            position,
        );
        // Forget the alias before propagating any error: dropping it would
        // free memory the workspace still owns.
        std::mem::forget(input_buf);
        layer_result?;
    }
    Ok(())
}
/// Debug aid (GH-559): prints summary stats (sum, RMS, first 5 values) of
/// the input hidden state for layer `layer_idx` of `num_layers`. Silently
/// does nothing when disabled or when readback fails.
fn debug_dump_layer_input(
    enabled: bool,
    input_buf: &GpuBuffer<f32>,
    buf_len: usize,
    hidden_dim: u32,
    layer_idx: usize,
    num_layers: usize,
) {
    if !enabled {
        return;
    }
    // Sample at most one hidden vector's worth of values.
    let sample_len = buf_len.min(hidden_dim as usize);
    let mut sample = vec![0.0f32; sample_len];
    if input_buf.copy_to_host(&mut sample).is_err() {
        return;
    }
    let total: f32 = sample.iter().sum();
    let rms: f32 =
        (sample.iter().map(|x| x * x).sum::<f32>() / sample.len() as f32).sqrt();
    eprintln!(
        "[GH-559] Layer {}/{} input: sum={:.6}, rms={:.6}, first5={:?}",
        layer_idx,
        num_layers,
        total,
        rms,
        &sample[..5.min(sample.len())]
    );
}
/// Runs every transformer layer via the indexed-weights path. Each layer
/// consumes the current hidden-state buffer and yields a fresh one; the
/// final buffer is returned to the caller.
#[allow(clippy::too_many_arguments)]
fn run_indexed_layers(
    &mut self,
    mut hidden_gpu: GpuBuffer<f32>,
    num_layers: usize,
    hidden_dim: u32,
    intermediate_dim: u32,
    epsilon: f32,
) -> Result<GpuBuffer<f32>, GpuError> {
    for idx in 0..num_layers {
        // Clone the weight handle so the borrow of `indexed_layer_weights`
        // ends before the `&mut self` call below.
        let weights = self.indexed_layer_weights[idx].clone();
        let next = self.transformer_layer_indexed(
            &hidden_gpu,
            idx,
            &weights,
            hidden_dim,
            intermediate_dim,
            epsilon,
        )?;
        hidden_gpu = next;
    }
    Ok(hidden_gpu)
}
/// Legacy fallback: runs each layer via `transformer_layer_gpu_cached`,
/// looking up the attn/ffn RMSNorm gammas in `rmsnorm_cache` by the names
/// precomputed in `layer_keys`.
///
/// Raw gamma ptr/len pairs are extracted so the immutable cache borrows end
/// before the `&mut self` layer call.
///
/// # Errors
/// `GpuError::InvalidLaunchConfig` if a gamma is missing from the cache;
/// otherwise whatever the per-layer kernel call returns.
#[allow(clippy::too_many_arguments)]
fn run_legacy_layers(
    &mut self,
    mut hidden_gpu: GpuBuffer<f32>,
    num_layers: usize,
    layer_keys: &[(String, String)],
    hidden_dim: u32,
    intermediate_dim: u32,
    epsilon: f32,
) -> Result<GpuBuffer<f32>, GpuError> {
    for layer_idx in 0..num_layers {
        let prefix = format!("blk.{}", layer_idx);
        let (ref attn_name, ref ffn_name) = layer_keys[layer_idx];
        let attn_gamma = self.rmsnorm_cache.get(attn_name).ok_or_else(|| {
            GpuError::InvalidLaunchConfig(format!(
                "PAR-023: Missing cached gamma for {}",
                attn_name
            ))
        })?;
        // Detach ptr/len so the cache borrow can end before the call below.
        let attn_ptr = attn_gamma.as_ptr();
        let attn_len = attn_gamma.len();
        let ffn_gamma = self.rmsnorm_cache.get(ffn_name).ok_or_else(|| {
            GpuError::InvalidLaunchConfig(format!(
                "PAR-023: Missing cached gamma for {}",
                ffn_name
            ))
        })?;
        let ffn_ptr = ffn_gamma.as_ptr();
        let ffn_len = ffn_gamma.len();
        hidden_gpu = self.transformer_layer_gpu_cached(
            &hidden_gpu,
            layer_idx,
            &prefix,
            hidden_dim,
            intermediate_dim,
            attn_ptr,
            attn_len,
            ffn_ptr,
            ffn_len,
            epsilon,
        )?;
    }
    Ok(hidden_gpu)
}
/// Debug aid (CORRECTNESS-001): downloads the hidden state just before the
/// output norm and prints its first values, sum, and RMS. No-op unless
/// `debug_enabled`.
///
/// The workspace path keeps the live hidden state in `hidden_buf2`, so a
/// non-owning `GpuBuffer` alias is built over whichever buffer is current
/// and `mem::forget`-ed after use so the real owner frees it exactly once.
///
/// Fixes over the previous version: `hidden_buf2` is bound once instead of
/// being `expect`-ed twice, and the forget now happens BEFORE the readback
/// result is propagated with `?` — previously an error dropped the alias
/// and double-freed device memory still owned elsewhere.
fn debug_dump_hidden_state(
    &mut self,
    hidden_gpu: &GpuBuffer<f32>,
    workspace_used: bool,
    debug_enabled: bool,
) -> Result<(), GpuError> {
    if !debug_enabled {
        return Ok(());
    }
    // Drain queued kernels so the readback observes completed output.
    self.stream.synchronize()?;
    let hidden_to_check = if workspace_used {
        let buf2 = self
            .workspace
            .hidden_buf2
            .as_ref()
            .expect("hidden_buf2 must be initialized");
        // SAFETY: non-owning alias of the live workspace buffer; it is
        // forgotten below so device memory is freed exactly once.
        unsafe { GpuBuffer::<f32>::from_raw_parts(buf2.as_ptr(), buf2.len()) }
    } else {
        // SAFETY: same non-owning alias pattern over the caller's buffer.
        unsafe { GpuBuffer::<f32>::from_raw_parts(hidden_gpu.as_ptr(), hidden_gpu.len()) }
    };
    let mut hidden_host = vec![0.0f32; hidden_to_check.len()];
    let copy_result = hidden_to_check.copy_to_host(&mut hidden_host);
    // Forget the alias before propagating any error: dropping it would
    // free memory the real owner still holds.
    std::mem::forget(hidden_to_check);
    copy_result?;
    let sum: f32 = hidden_host.iter().sum();
    let sum_sq: f32 = hidden_host.iter().map(|x| x * x).sum();
    eprintln!(
        "[CORRECTNESS-001] Hidden before output_norm: first 5 = {:?}, sum = {:.4}, rms = {:.4}",
        &hidden_host[..5.min(hidden_host.len())],
        sum,
        (sum_sq / hidden_host.len() as f32).sqrt()
    );
    Ok(())
}
/// Applies the final (output_norm) RMSNorm using the cached gamma and
/// returns a fresh normed buffer.
///
/// When `workspace_used`, the live hidden state is in the workspace's
/// `hidden_buf2` rather than `hidden_gpu`, so a non-owning alias is built
/// over it and `mem::forget`-ed after the kernel call.
///
/// Fixes over the previous version: `hidden_buf2` is bound once instead of
/// being `expect`-ed twice, and the forget now happens BEFORE the kernel
/// result is propagated — previously an error dropped the alias and
/// double-freed device memory still owned by the workspace.
///
/// # Errors
/// `GpuError::InvalidLaunchConfig` if `output_norm.gamma` is not cached;
/// otherwise whatever the RMSNorm kernel returns.
#[allow(clippy::too_many_arguments)]
fn apply_output_rmsnorm(
    &mut self,
    hidden_gpu: &GpuBuffer<f32>,
    workspace_used: bool,
    hidden_dim: u32,
    epsilon: f32,
) -> Result<GpuBuffer<f32>, GpuError> {
    let output_norm_gamma = self.rmsnorm_cache.get("output_norm.gamma").ok_or_else(|| {
        GpuError::InvalidLaunchConfig(
            "PAR-023: Missing cached gamma for output_norm.gamma".to_string(),
        )
    })?;
    // Detach ptr/len so the cache borrow ends before the &mut self call.
    let output_gamma_ptr = output_norm_gamma.as_ptr();
    let output_gamma_len = output_norm_gamma.len();
    if workspace_used {
        let buf2 = self
            .workspace
            .hidden_buf2
            .as_ref()
            .expect("hidden_buf2 must be initialized");
        let hidden_ptr = buf2.as_ptr();
        let hidden_len = buf2.len();
        // SAFETY: non-owning alias of the live workspace buffer; it is
        // forgotten below so device memory is freed exactly once.
        let hidden_input = unsafe { GpuBuffer::<f32>::from_raw_parts(hidden_ptr, hidden_len) };
        let result = self.rmsnorm_gpu_ptr(
            &hidden_input,
            output_gamma_ptr,
            output_gamma_len,
            hidden_dim,
            epsilon,
        );
        // Forget before propagating any error: dropping the alias would
        // double-free memory the workspace still owns.
        std::mem::forget(hidden_input);
        result
    } else {
        self.rmsnorm_gpu_ptr(
            hidden_gpu,
            output_gamma_ptr,
            output_gamma_len,
            hidden_dim,
            epsilon,
        )
    }
}
/// Debug aid (CORRECTNESS-002): downloads the normed hidden state and
/// prints its first values, sum, and RMS. No-op unless `debug_enabled`.
fn debug_dump_normed_hidden(
    &mut self,
    normed_hidden: &GpuBuffer<f32>,
    debug_enabled: bool,
) -> Result<(), GpuError> {
    if !debug_enabled {
        return Ok(());
    }
    // Drain queued kernels so the readback observes completed output.
    self.stream.synchronize()?;
    let mut host = vec![0.0f32; normed_hidden.len()];
    normed_hidden.copy_to_host(&mut host)?;
    let total: f32 = host.iter().sum();
    let total_sq: f32 = host.iter().map(|x| x * x).sum();
    eprintln!(
        "[CORRECTNESS-002] Normed hidden: first 5 = {:?}, sum = {:.4}, rms = {:.4}",
        &host[..5.min(host.len())],
        total,
        (total_sq / host.len() as f32).sqrt()
    );
    Ok(())
}
/// Full forward pass on the GPU: uploads `input`, runs every transformer
/// layer, applies the output RMSNorm, and writes vocabulary logits into
/// `logits` via the LM head.
///
/// # Errors
/// `GpuError::InvalidLaunchConfig` if any required RMSNorm gamma is missing
/// from the cache; otherwise propagates upload/kernel/download failures.
#[allow(clippy::too_many_arguments)]
pub fn forward_all_layers_gpu_to_logits(
    &mut self,
    input: &[f32],
    logits: &mut [f32],
    position: u32,
    num_layers: usize,
    hidden_dim: u32,
    intermediate_dim: u32,
    vocab_size: u32,
    epsilon: f32,
) -> Result<(), GpuError> {
    // Fail fast before any device work if gammas are missing.
    self.validate_rmsnorm_cache_for_logits(num_layers)?;
    let hidden_gpu = self.pad_and_upload_input(input)?;
    // `workspace_used` tells the norm/debug steps where the live hidden
    // state ended up (workspace buffer vs. the returned buffer).
    let (hidden_gpu, workspace_used) = self.run_transformer_stack(
        hidden_gpu, num_layers, hidden_dim, intermediate_dim, epsilon, position,
    )?;
    let debug_enabled = Self::gpu_debug_enabled();
    self.debug_dump_hidden_state(&hidden_gpu, workspace_used, debug_enabled)?;
    let normed_hidden = self.apply_output_rmsnorm_timed(
        &hidden_gpu, workspace_used, hidden_dim, epsilon,
    )?;
    self.debug_dump_normed_hidden(&normed_hidden, debug_enabled)?;
    self.dispatch_lm_head_and_download(
        &normed_hidden, logits, vocab_size, hidden_dim, debug_enabled,
    )
}
/// Dispatches the layer loop to the best available path, in priority order:
/// workspace (in-place buffers), indexed weights, then the legacy
/// name-keyed cache path. Returns the final hidden buffer plus a flag that
/// is `true` only for the workspace path (whose live output stays in
/// `workspace.hidden_buf2`).
fn run_transformer_stack(
    &mut self,
    mut hidden_gpu: GpuBuffer<f32>,
    num_layers: usize,
    hidden_dim: u32,
    intermediate_dim: u32,
    epsilon: f32,
    position: u32,
) -> Result<(GpuBuffer<f32>, bool), GpuError> {
    // Indexed weights are usable only when one handle exists per layer.
    let indexed_ready =
        self.has_indexed_weights() && self.indexed_layer_weights.len() == num_layers;
    if self.has_workspace() && indexed_ready {
        self.run_workspace_layers(
            &hidden_gpu, num_layers, hidden_dim, intermediate_dim, epsilon, position,
        )?;
        Ok((hidden_gpu, true))
    } else if indexed_ready {
        let out = self.run_indexed_layers(
            hidden_gpu, num_layers, hidden_dim, intermediate_dim, epsilon,
        )?;
        Ok((out, false))
    } else {
        // Legacy path: precompute the per-layer gamma cache keys.
        let layer_keys: Vec<(String, String)> = (0..num_layers)
            .map(|i| {
                (
                    format!("blk.{}.attn_norm.gamma", i),
                    format!("blk.{}.ffn_norm.gamma", i),
                )
            })
            .collect();
        hidden_gpu = self.run_legacy_layers(
            hidden_gpu, num_layers, &layer_keys, hidden_dim, intermediate_dim, epsilon,
        )?;
        Ok((hidden_gpu, false))
    }
}
/// Returns whether hidden-state debug dumps are enabled.
///
/// Reads `GPU_DEBUG` once per process ("1" enables) and caches the answer
/// in a `OnceLock`.
fn gpu_debug_enabled() -> bool {
    static HIDDEN_DEBUG: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    *HIDDEN_DEBUG.get_or_init(|| matches!(std::env::var("GPU_DEBUG"), Ok(v) if v == "1"))
}
/// Wraps `apply_output_rmsnorm` in an optional profiler timing scope for
/// the RmsNorm brick; timing is skipped entirely when profiling is off.
fn apply_output_rmsnorm_timed(
    &mut self,
    hidden_gpu: &GpuBuffer<f32>,
    workspace_used: bool,
    hidden_dim: u32,
    epsilon: f32,
) -> Result<GpuBuffer<f32>, GpuError> {
    let profiling_on = self.profiler.is_enabled();
    let timer = match profiling_on {
        true => self.start_brick_id(trueno::BrickId::RmsNorm),
        false => None,
    };
    let normed = self.apply_output_rmsnorm(hidden_gpu, workspace_used, hidden_dim, epsilon)?;
    if profiling_on {
        // Note: the timer is intentionally not stopped on the error path.
        self.stop_brick_id(timer, 1);
    }
    Ok(normed)
}
/// Converts a host-side hidden state directly into vocabulary logits:
/// upload (with 256-alignment padding), output RMSNorm, then the LM head
/// with download into `logits`. Debug dumps are disabled on this path.
pub fn hidden_to_logits(
    &mut self,
    hidden_state: &[f32],
    logits: &mut [f32],
    hidden_dim: u32,
    vocab_size: u32,
    epsilon: f32,
) -> Result<(), GpuError> {
    let staged = self.pad_and_upload_input(hidden_state)?;
    // workspace_used = false: the hidden state is in `staged`, not the
    // workspace buffers.
    let normed = self.apply_output_rmsnorm(&staged, false, hidden_dim, epsilon)?;
    self.dispatch_lm_head_and_download(&normed, logits, vocab_size, hidden_dim, false)
}
}
include!("logits.rs");
include!("forward_from_forward_from_forward.rs");