impl AprV2ModelCuda {
// ========================================================================
// Weight Pre-caching (2x performance optimization)
// ========================================================================
/// Extract model dimension config from metadata.
///
/// Returns `(hidden_dim, num_layers, num_heads, num_kv_heads, intermediate_dim, head_dim, kv_dim)`.
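    ///
    /// For example, with a hypothetical LLaMA-7B-style config (illustrative
    /// values, not read from any particular model):
    ///
    /// ```text
    /// hidden_size=4096, num_heads=32, num_kv_heads=8, intermediate_size=11008
    /// => head_dim = 4096 / 32 = 128
    /// => kv_dim   = 8 * 128   = 1024
    /// ```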
fn extract_model_dims(&self) -> (usize, usize, usize, usize, usize, usize, usize) {
let hidden_dim = self.model.metadata.hidden_size.unwrap_or(0);
let num_layers = self.model.metadata.num_layers.unwrap_or(0);
let num_heads = self.model.metadata.num_heads.unwrap_or(1);
let num_kv_heads = self.model.metadata.num_kv_heads.unwrap_or(num_heads);
let intermediate_dim = self
.model
.metadata
.intermediate_size
.unwrap_or(hidden_dim * 4);
let head_dim = if num_heads > 0 {
hidden_dim / num_heads
} else {
0
};
let kv_dim = num_kv_heads * head_dim;
(hidden_dim, num_layers, num_heads, num_kv_heads, intermediate_dim, head_dim, kv_dim)
}
/// Upload a single weight tensor (quantized or F32) to GPU.
///
/// Uses GGUF-style cache names for compatibility with `build_indexed_weights()`.
    /// PMAT-113: now caches F32 weights for GPU GEMM (previously, leaving F32
    /// weights uncached caused the APR CUDA hang).
///
/// # Returns
///
/// `(bytes_uploaded, is_quantized)`
fn upload_single_weight(&mut self, src_name: &str, cache_name: &str) -> (usize, bool) {
// Clone dtype and shape upfront to release the immutable borrow on self.model
// before calling &mut self methods below.
let (dtype, shape) = match self.model.get_tensor(src_name) {
Some(entry) => (entry.dtype.clone(), entry.shape.clone()),
None => return (0, false),
};
if let Some(qtype) = dtype_to_ggml_qtype(&dtype) {
// Quantized: upload raw bytes to quantized_weight_cache
self.upload_quantized_weight(src_name, cache_name, qtype)
} else {
// PMAT-113: F32/F16 - cache on GPU for GEMM path
// PMAT-222: Transpose 2D F32 weights from [n, k] to [k, n] for gemm_b_cached
// HF convention stores weights as [out_dim, in_dim] but GEMM needs B[k, n]
self.upload_f32_weight(src_name, cache_name, &shape)
}
}
/// Upload a quantized weight tensor to GPU.
fn upload_quantized_weight(
&mut self,
src_name: &str,
cache_name: &str,
qtype: u32,
) -> (usize, bool) {
if let Ok(bytes) = self.model.get_tensor_bytes(src_name) {
let size = self
.executor
.load_quantized_weights_with_type(cache_name, bytes, qtype)
.unwrap_or(0);
(size, true)
} else {
(0, false)
}
}
/// Upload an F32 weight tensor to GPU, transposing 2D weights for GEMM.
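    ///
    /// For example (illustrative values, not from a real checkpoint), a
    /// `[2, 3]` row-major weight is transposed to `[3, 2]`:
    ///
    /// ```text
    /// [a b c]       [a d]
    /// [d e f]  -->  [b e]
    ///               [c f]
    /// flat: [a,b,c,d,e,f] --> [a,d,b,e,c,f]
    /// ```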
fn upload_f32_weight(
&mut self,
src_name: &str,
cache_name: &str,
shape: &[usize],
) -> (usize, bool) {
if let Ok(weights) = self.model.get_tensor_f32(src_name) {
let final_weights = if shape.len() == 2 {
let rows = shape[0]; // out_dim (n)
let cols = shape[1]; // in_dim (k)
let mut transposed = vec![0.0f32; weights.len()];
for i in 0..rows {
for j in 0..cols {
transposed[j * rows + i] = weights[i * cols + j];
}
}
transposed
} else {
weights
};
let size = self
.executor
.load_weights(cache_name, &final_weights)
.unwrap_or(0);
(size, false)
} else {
(0, false)
}
}
/// Build per-layer weight name mappings for projection weights.
///
/// Returns a vec of `(source_patterns, cache_suffix)` for Q/K/V/O and FFN gate/up/down.
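    ///
    /// For example, with a hypothetical `layer_idx = 5`, the Q projection
    /// tries `model.layers.5.self_attn.q_proj.weight`,
    /// `layers.5.self_attn.q_proj.weight`, then `blk.5.attn_q.weight`;
    /// whichever matches is cached under `blk.5.attn_q.weight`.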
fn layer_projection_mappings(layer_idx: usize) -> Vec<(Vec<String>, &'static str)> {
vec![
(
vec![
format!("model.layers.{layer_idx}.self_attn.q_proj.weight"),
format!("layers.{layer_idx}.self_attn.q_proj.weight"),
format!("blk.{layer_idx}.attn_q.weight"),
],
"attn_q.weight",
),
(
vec![
format!("model.layers.{layer_idx}.self_attn.k_proj.weight"),
format!("layers.{layer_idx}.self_attn.k_proj.weight"),
format!("blk.{layer_idx}.attn_k.weight"),
],
"attn_k.weight",
),
(
vec![
format!("model.layers.{layer_idx}.self_attn.v_proj.weight"),
format!("layers.{layer_idx}.self_attn.v_proj.weight"),
format!("blk.{layer_idx}.attn_v.weight"),
],
"attn_v.weight",
),
(
vec![
format!("model.layers.{layer_idx}.self_attn.o_proj.weight"),
format!("layers.{layer_idx}.self_attn.o_proj.weight"),
format!("blk.{layer_idx}.attn_output.weight"),
],
"attn_output.weight",
),
(
vec![
format!("model.layers.{layer_idx}.mlp.gate_proj.weight"),
format!("layers.{layer_idx}.mlp.gate_proj.weight"),
format!("blk.{layer_idx}.ffn_gate.weight"),
],
"ffn_gate.weight",
),
(
vec![
format!("model.layers.{layer_idx}.mlp.up_proj.weight"),
format!("layers.{layer_idx}.mlp.up_proj.weight"),
format!("blk.{layer_idx}.ffn_up.weight"),
],
"ffn_up.weight",
),
(
vec![
format!("model.layers.{layer_idx}.mlp.down_proj.weight"),
format!("layers.{layer_idx}.mlp.down_proj.weight"),
format!("blk.{layer_idx}.ffn_down.weight"),
],
"ffn_down.weight",
),
]
}
/// Cache per-layer projection weights (Q/K/V/O + FFN gate/up/down).
///
/// Uses GGUF naming convention to match `build_indexed_weights()` expectations.
///
/// # Returns
///
/// `(total_bytes, quantized_count, f32_count)`
fn cache_layer_projections(&mut self, layer_idx: usize) -> (usize, usize, usize) {
let prefix = format!("blk.{layer_idx}");
let weight_mappings = Self::layer_projection_mappings(layer_idx);
let mut total_bytes = 0;
let mut quantized_count = 0;
let mut f32_count = 0;
for (patterns, suffix) in weight_mappings {
let patterns_ref: Vec<&str> = patterns.iter().map(String::as_str).collect();
if let Ok(src_name) = self.model.find_tensor_name(&patterns_ref) {
let cache_name = format!("{prefix}.{suffix}");
let (bytes, is_quantized) = self.upload_single_weight(&src_name, &cache_name);
if bytes > 0 {
total_bytes += bytes;
if is_quantized {
quantized_count += 1;
} else {
f32_count += 1;
}
}
}
}
(total_bytes, quantized_count, f32_count)
}
/// Cache fused QKV weight by unfusing into separate Q/K/V tensors.
///
/// PMAT-113: APR models from HuggingFace have Q/K/V fused into `qkv_proj.weight`.
/// Unfuse and cache as separate Q/K/V with names the forward path expects.
    /// NOTE: a P1 quality issue remains (SATD-WARNING in `generate_cuda_with_cache`):
    /// the APR import has corrupt tensor layouts; this caching does not fix that.
///
/// # Returns
///
/// `(total_bytes, f32_count)`
fn cache_fused_qkv(
&mut self,
layer_idx: usize,
hidden_dim: usize,
kv_dim: usize,
) -> (usize, usize) {
let fused_qkv_patterns = vec![format!(
"model.layers.{layer_idx}.self_attn.qkv_proj.weight"
)];
let fused_patterns_ref: Vec<&str> =
fused_qkv_patterns.iter().map(String::as_str).collect();
let src_name = match self.model.find_tensor_name(&fused_patterns_ref) {
Ok(name) => name,
Err(_) => return (0, 0),
};
// Load and unfuse QKV for F32 models
let qkv_weight = match self.model.get_tensor_f32(&src_name) {
Ok(w) => w,
Err(_) => return (0, 0),
};
// Unfuse: Q is first hidden_dim rows, K is next kv_dim, V is last kv_dim
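        // Illustrative layout (assumed values hidden_dim=2048, kv_dim=512;
        // each row has hidden_dim=2048 columns):
        //   rows [0,    2048) -> Q  (2048 x 2048)
        //   rows [2048, 2560) -> K  ( 512 x 2048)
        //   rows [2560, 3072) -> V  ( 512 x 2048)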
let q_size = hidden_dim * hidden_dim;
let k_size = kv_dim * hidden_dim;
let v_size = kv_dim * hidden_dim;
if qkv_weight.len() < q_size + k_size + v_size {
return (0, 0);
}
let q_weight: Vec<f32> = qkv_weight[0..q_size].to_vec();
let k_weight: Vec<f32> = qkv_weight[q_size..q_size + k_size].to_vec();
let v_weight: Vec<f32> =
qkv_weight[q_size + k_size..q_size + k_size + v_size].to_vec();
        // PMAT-114: trace K weight for layer 0 to debug a 100x discrepancy in K values
if layer_idx == 0 && std::env::var("APR_TRACE_LAYERS").is_ok() {
Self::trace_k_weight_pre_transpose(&k_weight);
}
// Transpose for GPU GEMM (row-major to column-major)
let q_weight_t = transpose_matrix(&q_weight, hidden_dim, hidden_dim);
let k_weight_t = transpose_matrix(&k_weight, kv_dim, hidden_dim);
let v_weight_t = transpose_matrix(&v_weight, kv_dim, hidden_dim);
// PMAT-114: Trace K weight after transpose
if layer_idx == 0 && std::env::var("APR_TRACE_LAYERS").is_ok() {
Self::trace_k_weight_post_transpose(&k_weight_t);
}
// Cache with GGUF-style naming to match forward path (PMAT-805)
let mut total_bytes = 0;
let mut f32_count = 0;
let weight_pairs = [
(format!("blk.{layer_idx}.attn_q.weight"), q_weight_t),
(format!("blk.{layer_idx}.attn_k.weight"), k_weight_t),
(format!("blk.{layer_idx}.attn_v.weight"), v_weight_t),
];
for (cache_name, weight_data) in &weight_pairs {
if let Ok(bytes) = self.executor.load_weights(cache_name, weight_data) {
total_bytes += bytes;
f32_count += 1;
}
}
(total_bytes, f32_count)
}
/// PMAT-114: Trace K weight statistics before transpose (layer 0 only).
fn trace_k_weight_pre_transpose(k_weight: &[f32]) {
let k_sum: f32 = k_weight.iter().sum();
let k_mean = k_sum / k_weight.len() as f32;
let k_min = k_weight.iter().cloned().fold(f32::INFINITY, f32::min);
let k_max = k_weight.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
eprintln!("[PMAT-114] L0 K weight (pre-transpose): mean={:.6}, min={:.6}, max={:.6}, len={}",
k_mean, k_min, k_max, k_weight.len());
eprintln!(
"[PMAT-114] L0 K weight first10={:?}",
&k_weight[..10.min(k_weight.len())]
);
}
/// PMAT-114: Trace K weight statistics after transpose (layer 0 only).
fn trace_k_weight_post_transpose(k_weight_t: &[f32]) {
let k_sum: f32 = k_weight_t.iter().sum();
let k_mean = k_sum / k_weight_t.len() as f32;
eprintln!(
"[PMAT-114] L0 K weight (post-transpose): mean={:.6}, len={}",
k_mean,
k_weight_t.len()
);
eprintln!(
"[PMAT-114] L0 K weight_t first10={:?}",
&k_weight_t[..10.min(k_weight_t.len())]
);
}
/// Build per-layer norm weight name mappings.
///
/// Returns mappings for attn_norm, ffn_norm, and GH-279 QK norm weights.
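    ///
    /// Note the cache suffix uses `.gamma` rather than `.weight`: for a
    /// hypothetical `layer_idx = 5`, `model.layers.5.input_layernorm.weight`
    /// is cached as `blk.5.attn_norm.gamma`.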
fn layer_norm_mappings(layer_idx: usize) -> Vec<(Vec<String>, &'static str)> {
vec![
(
vec![
format!("model.layers.{layer_idx}.input_layernorm.weight"),
format!("layers.{layer_idx}.input_layernorm.weight"),
format!("blk.{layer_idx}.attn_norm.weight"),
],
"attn_norm.gamma",
),
(
vec![
format!("model.layers.{layer_idx}.post_attention_layernorm.weight"),
format!("layers.{layer_idx}.post_attention_layernorm.weight"),
format!("blk.{layer_idx}.ffn_norm.weight"),
],
"ffn_norm.gamma",
),
// GH-279: QK norm weights (Qwen3 per-head RMSNorm on Q and K)
(
vec![
format!("model.layers.{layer_idx}.self_attn.q_norm.weight"),
format!("blk.{layer_idx}.attn_q_norm.weight"),
],
"attn_q_norm.gamma",
),
(
vec![
format!("model.layers.{layer_idx}.self_attn.k_norm.weight"),
format!("blk.{layer_idx}.attn_k_norm.weight"),
],
"attn_k_norm.gamma",
),
]
}
/// Cache RMSNorm gamma weights for a single layer (always F32).
///
/// Uploads attn_norm, ffn_norm, and GH-279 QK norm weights.
///
/// # Returns
///
/// Total bytes uploaded.
fn cache_layer_norms(&mut self, layer_idx: usize) -> usize {
let prefix = format!("blk.{layer_idx}");
let norm_mappings = Self::layer_norm_mappings(layer_idx);
let mut total_bytes = 0;
for (patterns, suffix) in norm_mappings {
let patterns_ref: Vec<&str> = patterns.iter().map(String::as_str).collect();
if let Ok(src_name) = self.model.find_tensor_name(&patterns_ref) {
if let Ok(gamma) = self.model.get_tensor_f32(&src_name) {
let cache_name = format!("{prefix}.{suffix}");
if let Ok(bytes) = self.executor.cache_rmsnorm_gamma(&cache_name, &gamma) {
total_bytes += bytes;
}
}
}
}
total_bytes
}
/// Cache global weights: output norm + LM head.
///
/// # Returns
///
/// `(total_bytes, quantized_count)`
fn cache_global_weights(&mut self) -> (usize, usize) {
let mut total_bytes = 0;
let mut quantized_count = 0;
// Cache output norm
total_bytes += self.cache_output_norm();
// Cache LM head (may be quantized or F32)
let (b, q) = self.cache_lm_head();
total_bytes += b;
quantized_count += q;
(total_bytes, quantized_count)
}
/// Cache output norm weight.
///
/// # Returns
///
/// Bytes uploaded.
fn cache_output_norm(&mut self) -> usize {
let output_norm_patterns = [
"model.norm.weight",
"norm.weight",
"transformer.ln_f.weight",
"output_norm.weight",
];
if let Ok(src_name) = self.model.find_tensor_name(&output_norm_patterns) {
if let Ok(gamma) = self.model.get_tensor_f32(&src_name) {
if let Ok(bytes) = self
.executor
.cache_rmsnorm_gamma("output_norm.gamma", &gamma)
{
return bytes;
}
}
}
0
}
/// Cache LM head weight (may be quantized or F32).
///
/// # Returns
///
/// `(bytes_uploaded, quantized_count)`
fn cache_lm_head(&mut self) -> (usize, usize) {
let lm_head_patterns = [
"lm_head.weight",
"output.weight",
"token_embd.weight", // GGUF (tied embeddings)
];
let src_name = match self.model.find_tensor_name(&lm_head_patterns) {
Ok(name) => name,
Err(_) => return (0, 0),
};
        let dtype = match self.model.get_tensor(&src_name) {
            Some(e) => e.dtype.clone(),
            None => return (0, 0),
        };
        if let Some(qtype) = dtype_to_ggml_qtype(&dtype) {
// Quantized LM head
self.cache_lm_head_quantized(&src_name, qtype)
} else {
// F32 LM head - store as quantized_weight_cache for compatibility
// The forward path will handle F32 appropriately
(self.cache_lm_head_f32(&src_name), 0)
}
}
/// Cache quantized LM head weight.
fn cache_lm_head_quantized(&mut self, src_name: &str, qtype: u32) -> (usize, usize) {
if let Ok(bytes) = self.model.get_tensor_bytes(src_name) {
if let Ok(size) = self.executor.load_quantized_weights_with_type(
"output.weight",
bytes,
qtype,
) {
return (size, 1);
}
}
(0, 0)
}
/// Cache F32 LM head weight (uploaded as raw bytes with qtype 0).
fn cache_lm_head_f32(&mut self, src_name: &str) -> usize {
if let Ok(w) = self.model.get_tensor_f32(src_name) {
// Upload F32 weights directly (no transpose needed for GEMV)
// SAFETY: f32 slice to u8 view - valid because f32 has no padding,
// alignment requirement of u8 is 1, and lifetime is preserved
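            // A safe alternative sketch, assuming the `bytemuck` crate were a
            // dependency (it is not used here): bytemuck::cast_slice::<f32, u8>(&w)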
let w_bytes: &[u8] = unsafe {
std::slice::from_raw_parts(
w.as_ptr().cast::<u8>(),
w.len() * std::mem::size_of::<f32>(),
)
};
// Use qtype 0 to indicate F32 (handled specially in forward)
if let Ok(size) = self.executor.load_quantized_weights_with_type(
"output.weight",
w_bytes,
0,
) {
return size;
}
}
0
}
/// Pre-cache all model weights on GPU using native quantized format.
///
/// This uploads quantized weights (Q4K, Q6K, etc.) directly to GPU without
/// CPU dequantization, enabling fused dequant+matmul kernels for maximum
/// throughput (2x+ Ollama baseline per APR mandate).
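    ///
    /// Upload order, as implemented below: per-layer projection weights,
    /// fused-QKV unfusing (PMAT-113), per-layer RMSNorm gammas, global
    /// weights (output norm + LM head), then the indexed weight table and
    /// workspace initialization.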
///
    /// # Errors
    ///
    /// Returns `RealizarError::GpuError` if GH-279 architecture weight
    /// validation fails in `build_indexed_weights()`.
fn pre_cache_weights(&mut self) -> Result<()> {
let (hidden_dim, num_layers, _num_heads, _num_kv_heads, intermediate_dim, _head_dim, kv_dim) =
self.extract_model_dims();
if hidden_dim == 0 || num_layers == 0 {
return Ok(()); // Non-transformer model, nothing to cache
}
let mut total_bytes = 0usize;
let mut quantized_count = 0usize;
let mut f32_weight_count = 0usize;
// Cache per-layer weights using GGUF naming convention
// This matches build_indexed_weights() expectations
for layer_idx in 0..num_layers {
let (b, q, f) = self.cache_layer_projections(layer_idx);
total_bytes += b;
quantized_count += q;
f32_weight_count += f;
// PMAT-113: Cache fused QKV from APR import (PMAT-101)
let (b, f) = self.cache_fused_qkv(layer_idx, hidden_dim, kv_dim);
total_bytes += b;
f32_weight_count += f;
// Weights are loaded via GGUF-style names (blk.{}.attn_output.weight etc.)
// in the first pass above. Biases are read directly from the model at
// inference time. This avoids duplicate GPU memory for 1.5B F32 models.
// Upload RMSNorm gamma weights (always F32)
total_bytes += self.cache_layer_norms(layer_idx);
}
let (b, q) = self.cache_global_weights();
total_bytes += b;
quantized_count += q;
// Build indexed weight lookup table for O(1) access during decode
// This is the key optimization that enables fast token generation
// GH-279: Always build + validate, even for F32-only models.
// The validation catches missing architecture-required weights.
{
// R-01 (Meyer DbC): "unknown" — don't pretend unidentified model is LLaMA.
let arch_name = self.model.metadata.model_type.as_deref().unwrap_or("unknown");
let arch = crate::gguf::ArchConstraints::from_architecture(arch_name);
self.executor
.build_indexed_weights(num_layers, |i| format!("blk.{i}"), &arch)
.map_err(|e| RealizarError::GpuError {
reason: format!(
"GH-279: Architecture weight validation failed: {e}"
),
})?;
eprintln!(
"[AprV2ModelCuda] Built indexed weights for {} layers (arch={})",
num_layers, arch_name
);
}
// Initialize workspace for zero-allocation forward pass
if let Err(e) = self.executor.init_workspace(hidden_dim, intermediate_dim) {
eprintln!("[AprV2ModelCuda] Warning: Could not init workspace: {e}");
}
// PMAT-113: Log both quantized and F32 weight counts
eprintln!(
"[AprV2ModelCuda] Pre-cached {} MB of weights on GPU ({} layers, {} quantized, {} F32 tensors)",
total_bytes / (1024 * 1024),
num_layers,
quantized_count,
f32_weight_count
);
Ok(())
}
}