impl CudaExecutor {
/// Debug aid: reads `buf` back to the host and reports on stderr whether it
/// contains any NaN values.
///
/// The stream is synchronized before the read-back (presumably so that work
/// still in flight has finished — confirm against the stream semantics).
/// When no NaN is present, up to the first three elements are printed instead.
/// Every line is tagged with `[PAR-058-L<layer_idx>]` and `label`.
///
/// # Errors
///
/// Propagates any error from `stream.synchronize()` or `buf.copy_to_host()`.
fn debug_check_buf(
    &mut self,
    buf: &GpuBuffer<f32>,
    label: &str,
    layer_idx: usize,
) -> Result<(), GpuError> {
    self.stream.synchronize()?;

    // Host-side snapshot with the same element count as the device buffer.
    let mut snapshot = vec![0.0f32; buf.len()];
    buf.copy_to_host(&mut snapshot)?;

    let mut nan_total = 0usize;
    for value in &snapshot {
        if value.is_nan() {
            nan_total += 1;
        }
    }

    match nan_total {
        0 => {
            // Clamp the preview so buffers shorter than three elements do not panic.
            let preview_len = snapshot.len().min(3);
            eprintln!(
                "[PAR-058-L{}] {} OK, first 3: {:?}",
                layer_idx,
                label,
                &snapshot[..preview_len]
            );
        }
        found => eprintln!("[PAR-058-L{}] {} has {} NaN", layer_idx, label, found),
    }

    Ok(())
}
/// Routes a GEMV call to the `*_gemv_into` method that matches `qtype`.
///
/// `weight_ptr`, `input`, `output`, `n` and `k` are forwarded unchanged to the
/// selected method, and that method's result is returned as-is.
/// NOTE(review): `n` / `k` are presumably the output / input dimensions of the
/// weight matrix — confirm against the individual kernels.
///
/// # Errors
///
/// Returns whatever error the selected `*_gemv_into` method reports.
#[allow(clippy::too_many_arguments)]
pub(crate) fn gemv_dispatch(
    &mut self,
    qtype: WeightQuantType,
    weight_ptr: u64,
    input: &GpuBuffer<f32>,
    output: &GpuBuffer<f32>,
    n: u32,
    k: u32,
) -> Result<(), GpuError> {
    // Every arm passes the identical argument list; only the method differs.
    macro_rules! forward_to {
        ($kernel:ident) => {
            self.$kernel(weight_ptr, input, output, n, k)
        };
    }

    // Deliberately no catch-all arm: a new quantization variant must fail to
    // compile here until it is wired to a kernel.
    match qtype {
        WeightQuantType::Q4_0 => forward_to!(q4_0_gemv_into),
        WeightQuantType::Q4_1 => forward_to!(q4_1_gemv_into),
        WeightQuantType::Q5_0 => forward_to!(q5_0_gemv_into),
        WeightQuantType::Q4K => forward_to!(q4k_gemv_into),
        WeightQuantType::Q5K => forward_to!(q5k_gemv_into),
        WeightQuantType::Q6K => forward_to!(q6k_gemv_into),
        WeightQuantType::Q8_0 => forward_to!(q8_0_gemv_into),
        WeightQuantType::F32 => forward_to!(f32_gemv_into),
    }
}
#[allow(clippy::too_many_arguments, clippy::too_many_lines)]
pub(crate) fn transformer_layer_workspace_inner(
&mut self,
input: &GpuBuffer<f32>,
layer_idx: usize,
layer_weights: &ValidatedLayerWeights,
hidden_dim: u32,
intermediate_dim: u32,
epsilon: f32,
position: u32, skip_debug: bool,
) -> Result<(), GpuError> {
if !self.workspace.initialized {
return Err(GpuError::InvalidLaunchConfig(
"PAR-044: Workspace not initialized. Call init_workspace() first.".to_string(),
));
}
if layer_weights.attn_norm_ptr == 0 {
return Err(GpuError::InvalidParameter(
"attn_norm_ptr is null (0)".into(),
));
}
if layer_weights.attn_q_ptr == 0 {
return Err(GpuError::InvalidParameter("attn_q_ptr is null (0)".into()));
}
if layer_weights.attn_k_ptr == 0 {
return Err(GpuError::InvalidParameter("attn_k_ptr is null (0)".into()));
}
if layer_weights.attn_v_ptr == 0 {
return Err(GpuError::InvalidParameter("attn_v_ptr is null (0)".into()));
}
if layer_weights.attn_output_ptr == 0 {
return Err(GpuError::InvalidParameter(
"attn_output_ptr is null (0)".into(),
));
}
if layer_weights.ffn_norm_ptr == 0 {
return Err(GpuError::InvalidParameter(
"ffn_norm_ptr is null (0)".into(),
));
}
if layer_weights.ffn_gate_ptr == 0 {
return Err(GpuError::InvalidParameter(
"ffn_gate_ptr is null (0)".into(),
));
}
if layer_weights.ffn_up_ptr == 0 {
return Err(GpuError::InvalidParameter("ffn_up_ptr is null (0)".into()));
}
if layer_weights.ffn_down_ptr == 0 {
return Err(GpuError::InvalidParameter(
"ffn_down_ptr is null (0)".into(),
));
}
let q_dim = (self.kv_num_heads * self.kv_head_dim) as u32;
let kv_dim = (self.kv_num_kv_heads * self.kv_head_dim) as u32;
let hidden_buf1_ptr = self
.workspace
.hidden_buf1
.as_ref()
.expect("hidden_buf1 must be initialized")
.as_ptr();
let hidden_buf1_len = self
.workspace
.hidden_buf1
.as_ref()
.expect("hidden_buf1 must be initialized")
.len();
let hidden_buf2_ptr = self
.workspace
.hidden_buf2
.as_ref()
.expect("hidden_buf2 must be initialized")
.as_ptr();
let hidden_buf2_len = self
.workspace
.hidden_buf2
.as_ref()
.expect("hidden_buf2 must be initialized")
.len();
let input_staging_ptr = self
.workspace
.input_staging
.as_ref()
.expect("input_staging must be initialized")
.as_ptr();
let input_staging_len = self
.workspace
.input_staging
.as_ref()
.expect("input_staging must be initialized")
.len();
let q_buf_ptr = self
.workspace
.q_buf
.as_ref()
.expect("q_buf must be initialized")
.as_ptr();
let q_buf_len = self
.workspace
.q_buf
.as_ref()
.expect("q_buf must be initialized")
.len();
let k_buf_ptr = self
.workspace
.k_buf
.as_ref()
.expect("k_buf must be initialized")
.as_ptr();
let k_buf_len = self
.workspace
.k_buf
.as_ref()
.expect("k_buf must be initialized")
.len();
let v_buf_ptr = self
.workspace
.v_buf
.as_ref()
.expect("v_buf must be initialized")
.as_ptr();
let v_buf_len = self
.workspace
.v_buf
.as_ref()
.expect("v_buf must be initialized")
.len();
let ffn_gate_ptr = self
.workspace
.ffn_gate_buf
.as_ref()
.expect("ffn_gate_buf must be initialized")
.as_ptr();
let ffn_gate_len = self
.workspace
.ffn_gate_buf
.as_ref()
.expect("ffn_gate_buf must be initialized")
.len();
let ffn_up_ptr = self
.workspace
.ffn_up_buf
.as_ref()
.expect("ffn_up_buf must be initialized")
.as_ptr();
let ffn_up_len = self
.workspace
.ffn_up_buf
.as_ref()
.expect("ffn_up_buf must be initialized")
.len();
let ffn_act_ptr = self
.workspace
.ffn_act_buf
.as_ref()
.expect("ffn_act_buf must be initialized")
.as_ptr();
let ffn_act_len = self
.workspace
.ffn_act_buf
.as_ref()
.expect("ffn_act_buf must be initialized")
.len();
let attn_out_ptr = self
.workspace
.attn_out_buf
.as_ref()
.expect("attn_out_buf must be initialized")
.as_ptr();
let attn_out_len = self
.workspace
.attn_out_buf
.as_ref()
.expect("attn_out_buf must be initialized")
.len();
let hidden_buf1 =
unsafe { GpuBuffer::<f32>::from_raw_parts(hidden_buf1_ptr, hidden_buf1_len) };
let hidden_buf2 =
unsafe { GpuBuffer::<f32>::from_raw_parts(hidden_buf2_ptr, hidden_buf2_len) };
let input_staging =
unsafe { GpuBuffer::<f32>::from_raw_parts(input_staging_ptr, input_staging_len) };
let q_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(q_buf_ptr, q_buf_len) };
let k_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(k_buf_ptr, k_buf_len) };
let v_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(v_buf_ptr, v_buf_len) };
let ffn_gate_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(ffn_gate_ptr, ffn_gate_len) };
let ffn_up_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(ffn_up_ptr, ffn_up_len) };
let ffn_act_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(ffn_act_ptr, ffn_act_len) };
let attn_out_buf = unsafe { GpuBuffer::<f32>::from_raw_parts(attn_out_ptr, attn_out_len) };
let profiling = self.profiler.is_enabled();
self.workspace_qkv_rope_phase(
input, &hidden_buf1, &q_buf, &k_buf, &v_buf,
layer_idx, layer_weights, hidden_dim, q_dim, kv_dim,
epsilon, position, skip_debug, profiling,
)?;
self.workspace_attention_residual_phase(
input, &hidden_buf1, &q_buf, &k_buf, &v_buf,
&attn_out_buf, &input_staging,
layer_idx, layer_weights, hidden_dim, q_dim,
skip_debug, profiling,
)?;
self.workspace_ffn_phase(
&hidden_buf1, &hidden_buf2, &input_staging,
&ffn_gate_buf, &ffn_up_buf, &ffn_act_buf,
layer_idx, layer_weights, hidden_dim, intermediate_dim,
epsilon, skip_debug, profiling,
)?;
std::mem::forget(hidden_buf1);
std::mem::forget(hidden_buf2);
std::mem::forget(input_staging);
std::mem::forget(q_buf);
std::mem::forget(k_buf);
std::mem::forget(v_buf);
std::mem::forget(attn_out_buf); std::mem::forget(ffn_gate_buf);
std::mem::forget(ffn_up_buf);
std::mem::forget(ffn_act_buf);
Ok(())
}
}
include!("indexed_transformer.rs");
include!("apply.rs");
include!("phase_attention.rs");
include!("indexed_ffn.rs");