impl GpuModel {
/// IMP-1008: Forward single block without weight cloning
///
/// Uses an interior-mutability pattern to avoid cloning weights on each matmul.
/// This method takes `&self` instead of `&mut self`.
///
/// # Errors
///
/// Returns error if forward pass fails.
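///
/// # Example
///
/// A minimal sketch of driving one block (hypothetical setup; the embedded
/// input vector and the cache dimensions are assumed to come from the caller):
///
/// ```ignore
/// let mut cache = StreamingKVCache::new(num_layers, max_seq_len, num_kv_heads, head_dim);
/// // One hidden_dim-sized vector in, one out; repeat for each block in order.
/// let hidden = model.forward_block_refcell(&embedded, 0, &mut cache)?;
/// ```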
#[cfg(feature = "cuda")]
pub fn forward_block_refcell(
&self,
input: &[f32],
block_idx: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
// Phase 21 Debug: trace first forward call only
static DEBUG_COUNTER: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
let call_count = DEBUG_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let debug_this_call = block_idx == 0 && call_count == 0; // Only first call to block 0
// Extract config values (Copy types, no borrow conflict)
let hidden_dim = self.config.hidden_dim;
let num_heads = self.config.num_heads;
let head_dim = self.config.head_dim();
let kv_dim = self.config.kv_dim();
let qkv_dim = self.config.qkv_dim();
let intermediate_dim = self.config.intermediate_dim;
let eps = self.config.eps;
let num_kv_heads = self.config.num_kv_heads;
if debug_this_call {
eprintln!(
"[PHASE21] forward_block_refcell START block_idx={}",
block_idx
);
eprintln!(
"[PHASE21] input L2: {:.4}",
input.iter().map(|x| x * x).sum::<f32>().sqrt()
);
}
// IMP-1008: No cloning! Direct reference to weights
// Pre-attention layer norm (static function avoids &self borrow)
let normed = Self::layer_norm_static(
input,
&self.block_weights[block_idx].attn_norm_weight,
&self.block_weights[block_idx].attn_norm_bias,
hidden_dim,
eps,
);
// QKV projection - NO CLONE!
let mut qkv = self.matmul_refcell(
&normed,
&self.block_weights[block_idx].qkv_weight,
1,
hidden_dim,
qkv_dim,
)?;
// F-REGR-231 FIX: Add QKV bias (critical for correct attention)
// The GGUF path applies bias after the matmul; the APR path must do the same
let qkv_bias = &self.block_weights[block_idx].qkv_bias;
if debug_this_call {
eprintln!(
"[PHASE21-BIAS] qkv_bias len: {}, qkv len: {}, bias first 5: {:?}",
qkv_bias.len(),
qkv.len(),
&qkv_bias[..5.min(qkv_bias.len())]
);
}
if !qkv_bias.is_empty() && qkv_bias.len() == qkv.len() {
for (q, b) in qkv.iter_mut().zip(qkv_bias.iter()) {
*q += *b;
}
}
if debug_this_call {
eprintln!(
"[PHASE21] QKV L2: {:.4}",
qkv.iter().map(|x| x * x).sum::<f32>().sqrt()
);
// F-REGR-231 DEBUG: Show Q values after bias
eprintln!(
"[PHASE21] Q after bias first 5: {:?}",
&qkv[..5.min(qkv.len())]
);
}
// Get current position BEFORE caching (Phase 21)
let (cached_k_ref, _) = kv_cache.get_valid(block_idx);
let current_pos = cached_k_ref.len() / kv_dim;
// Phase 21: Apply RoPE to Q and K BEFORE caching
// Without RoPE, attention has no position information and produces garbage
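// Layout reminder: Q occupies qkv[0..hidden_dim] (num_heads * head_dim floats),
// K the next kv_dim floats (num_kv_heads * head_dim). current_pos is the number
// of tokens already cached, i.e. the absolute position of the new token.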
let rope_theta = self.config.rope_theta;
Self::apply_rope_inline(
&mut qkv[0..hidden_dim],
num_heads,
head_dim,
rope_theta,
current_pos,
);
Self::apply_rope_inline(
&mut qkv[hidden_dim..hidden_dim + kv_dim],
num_kv_heads,
head_dim,
rope_theta,
current_pos,
);
// Split QKV (GQA: K/V have kv_dim, not hidden_dim) - after RoPE
let q = qkv[0..hidden_dim].to_vec();
let k_new = qkv[hidden_dim..hidden_dim + kv_dim].to_vec();
let v_new = qkv[hidden_dim + kv_dim..].to_vec();
// F-REGR-231 DEBUG: Show K and V values after bias and RoPE
if debug_this_call {
eprintln!(
"[PHASE21] K after RoPE first 5: {:?}",
&k_new[..5.min(k_new.len())]
);
eprintln!("[PHASE21] V first 5: {:?}", &v_new[..5.min(v_new.len())]);
}
// Get cached K/V and clone to avoid borrow issues with kv_cache
let (cached_k, cached_v) = kv_cache.get_valid(block_idx);
let keys_cached = cached_k.to_vec();
let vals_cached = cached_v.to_vec();
// Append new K/V (with RoPE applied) to cache
kv_cache.append(block_idx, &k_new, &v_new);
// Build full K/V (cached + new)
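// kv_len counts tokens: everything previously cached plus the one just appended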
let kv_len = keys_cached.len() / kv_dim + 1;
let mut full_k = keys_cached;
full_k.extend_from_slice(&k_new);
let mut full_v = vals_cached;
full_v.extend_from_slice(&v_new);
// GQA attention (IMP-089): static method to avoid borrow conflicts
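// Each group of num_heads / num_kv_heads query heads shares one K/V head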
let attn_output = Self::gqa_multihead_attention(
&q,
&full_k,
&full_v,
kv_len,
num_heads,
num_kv_heads,
head_dim,
);
if debug_this_call {
eprintln!(
"[PHASE21] attn_output L2: {:.4}",
attn_output.iter().map(|x| x * x).sum::<f32>().sqrt()
);
}
// Output projection - NO CLONE!
let attn_proj = self.matmul_refcell(
&attn_output,
&self.block_weights[block_idx].out_weight,
1,
hidden_dim,
hidden_dim,
)?;
// Add residual and bias
let out_bias = &self.block_weights[block_idx].out_bias;
let post_attn: Vec<f32> = input
.iter()
.zip(attn_proj.iter())
.zip(out_bias.iter())
.map(|((&i, &a), &b)| i + a + b)
.collect();
// FFN with layer norm (static function)
let ffn_normed = Self::layer_norm_static(
&post_attn,
&self.block_weights[block_idx].ffn_norm_weight,
&self.block_weights[block_idx].ffn_norm_bias,
hidden_dim,
eps,
);
// FFN: MoE dispatch when experts present, SwiGLU/GELU otherwise
let output: Vec<f32> = if let Some(ref moe) = self.block_weights[block_idx].moe_experts {
// ALB-010: MoE forward — route to top-k experts + shared expert
let moe_out = super::moe_dispatch::moe_forward_token(&ffn_normed, moe, hidden_dim);
// Residual connection (MoE has no separate bias)
post_attn
.iter()
.zip(moe_out.iter())
.map(|(&h, &m)| h + m)
.collect()
} else {
// Dense FFN: SwiGLU when gate weight exists, otherwise GELU
let fc1_activated: Vec<f32> = if let Some(ref gate_weight) =
self.block_weights[block_idx].ffn_gate_weight
{
// SwiGLU: silu(gate(x)) * up(x)
let up_out = self.matmul_refcell(
&ffn_normed,
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
)?;
let gate_out =
self.matmul_refcell(&ffn_normed, gate_weight, 1, hidden_dim, intermediate_dim)?;
up_out
.iter()
.zip(gate_out.iter())
.map(|(&u, &g)| {
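// SiLU(g) = g * sigmoid(g) = g / (1 + e^(-g))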
let silu_g = g / (1.0 + (-g).exp());
silu_g * u
})
.collect()
} else {
// Standard GELU FFN
let fc1_out = self.matmul_refcell(
&ffn_normed,
&self.block_weights[block_idx].ffn_fc1_weight,
1,
hidden_dim,
intermediate_dim,
)?;
let ffn_fc1_bias = &self.block_weights[block_idx].ffn_fc1_bias;
fc1_out
.iter()
.zip(ffn_fc1_bias.iter())
.map(|(&x, &b)| {
let x_b = x + b;
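// GELU tanh approximation: 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)));
// 0.797_884_6 ≈ sqrt(2/pi)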
x_b * 0.5 + x_b * 0.5 * (0.797_884_6 * (x_b + 0.044_715 * x_b.powi(3))).tanh()
})
.collect()
};
// FFN FC2 (down projection) - NO CLONE!
let fc2_out = self.matmul_refcell(
&fc1_activated,
&self.block_weights[block_idx].ffn_fc2_weight,
1,
intermediate_dim,
hidden_dim,
)?;
// Add bias and residual
let ffn_fc2_bias = &self.block_weights[block_idx].ffn_fc2_bias;
post_attn
.iter()
.zip(fc2_out.iter())
.zip(ffn_fc2_bias.iter())
.map(|((&h, &f), &b)| h + f + b)
.collect()
};
if debug_this_call {
eprintln!(
"[PHASE21] block output L2: {:.4}",
output.iter().map(|x| x * x).sum::<f32>().sqrt()
);
}
Ok(output)
}
/// IMP-1008: Full incremental forward pass without weight cloning
///
/// Uses the interior-mutability pattern throughout for zero-clone operation.
///
/// # Errors
///
/// Returns error if forward pass fails.
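///
/// # Example
///
/// A minimal incremental-decoding sketch (hypothetical setup; cache
/// dimensions and token IDs are assumed to come from the caller):
///
/// ```ignore
/// let mut cache = StreamingKVCache::new(num_layers, max_seq_len, num_kv_heads, head_dim);
/// // Feed tokens one at a time; each call returns vocab_size logits.
/// let logits = model.forward_refcell(token_id, &mut cache)?;
/// ```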
#[cfg(feature = "cuda")]
pub fn forward_refcell(
&self,
token_id: usize,
kv_cache: &mut StreamingKVCache,
) -> Result<Vec<f32>> {
// Phase 21: Debug first call only
static FWD_COUNTER: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
let fwd_count = FWD_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let debug_this_fwd = fwd_count == 0;
if token_id >= self.config.vocab_size {
return Err(RealizarError::InvalidShape {
reason: format!(
"Token ID {} out of bounds (vocab_size={})",
token_id, self.config.vocab_size
),
});
}
let hidden_dim = self.config.hidden_dim;
// Embed single token
let offset = token_id * hidden_dim;
let mut hidden = self.embedding_weights[offset..offset + hidden_dim].to_vec();
// Process through all blocks - NO CLONE!
for block_idx in 0..self.config.num_layers {
hidden = self.forward_block_refcell(&hidden, block_idx, kv_cache)?;
}
// Final layer norm
hidden = self.layer_norm_refcell(&hidden, &self.final_norm_weight, &self.final_norm_bias);
// LM head projection
let lm_head_elements = hidden_dim * self.config.vocab_size;
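// Very large vocab projections can exceed the GPU buffer limit; fall back to CPU there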
let output = if exceeds_gpu_buffer_limit(lm_head_elements) {
// CPU path with transposed weights + SIMD + fused bias
cpu_matmul_transposed_simd(
&hidden,
&self.lm_head_weight_t,
&self.lm_head_bias,
hidden_dim,
self.config.vocab_size,
)
} else {
// GPU path - NO CLONE!
let vocab_size = self.config.vocab_size;
let logits =
self.matmul_refcell(&hidden, &self.lm_head_weight, 1, hidden_dim, vocab_size)?;
// Add bias
logits
.into_iter()
.zip(self.lm_head_bias.iter())
.map(|(l, &b)| l + b)
.collect()
};
if debug_this_fwd {
// Find argmax
let (argmax_idx, argmax_val) = output
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or((0, &0.0));
eprintln!(
"[PHASE21] forward_refcell: final hidden L2: {:.4}",
hidden.iter().map(|x| x * x).sum::<f32>().sqrt()
);
eprintln!(
"[PHASE21] forward_refcell: logits argmax: {} (val: {:.4})",
argmax_idx, argmax_val
);
}
Ok(output)
}
/// IMP-1008: Layer norm with RefCell pattern (takes &self)
#[cfg(feature = "cuda")]
fn layer_norm_refcell(&self, input: &[f32], weight: &[f32], bias: &[f32]) -> Vec<f32> {
Self::layer_norm_static(input, weight, bias, self.config.hidden_dim, self.config.eps)
}
/// IMP-1008: Generate tokens without weight cloning
///
/// Uses the interior-mutability pattern for zero-clone inference.
///
/// # Errors
///
/// Returns error if generation fails.
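///
/// # Example
///
/// A minimal sketch. The `GpuGenerateConfig` fields shown are the ones this
/// method reads (`max_tokens`, `temperature`, `top_k`, `stop_tokens`); the
/// literal construction itself is hypothetical:
///
/// ```ignore
/// let cfg = GpuGenerateConfig {
///     max_tokens: 32,
///     temperature: 0.0, // 0.0 (or top_k == 1) selects greedy decoding
///     top_k: 1,
///     stop_tokens: vec![eos_token],
///     ..Default::default()
/// };
/// let tokens = model.generate_refcell(&prompt_ids, &cfg)?;
/// ```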
#[cfg(feature = "cuda")]
pub fn generate_refcell(
&self,
prompt: &[usize],
config: &GpuGenerateConfig,
) -> Result<Vec<usize>> {
if prompt.is_empty() {
return Err(RealizarError::InvalidShape {
reason: "Prompt cannot be empty".to_string(),
});
}
let num_kv_heads = self.config.num_kv_heads;
let head_dim = self.config.head_dim();
let max_seq_len = prompt.len() + config.max_tokens;
// Initialize KV cache
let mut kv_cache =
StreamingKVCache::new(self.config.num_layers, max_seq_len, num_kv_heads, head_dim);
let mut tokens = prompt.to_vec();
// F-REGR-231 FIX: Process prefill correctly
// Process all but last prompt token to populate KV cache (discard logits)
// Then process last token to get logits for first generation
let prompt_len = prompt.len();
for &token_id in &prompt[..prompt_len.saturating_sub(1)] {
let _ = self.forward_refcell(token_id, &mut kv_cache)?;
}
// Process last prompt token to get logits for first generated token
let last_prompt_token = prompt[prompt_len - 1];
let mut current_logits = self.forward_refcell(last_prompt_token, &mut kv_cache)?;
// F-REGR-231 DEBUG: Show logits from last prompt token
let argmax = current_logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(idx, _)| idx);
let argmax_val = current_logits.get(argmax).copied().unwrap_or(0.0);
eprintln!(
"[PHASE21-GEN] Last prompt token: {}, logits argmax: {} (val: {:.4}), top5 logits: {:?}",
last_prompt_token,
argmax,
argmax_val,
{
let mut indexed: Vec<(usize, f32)> = current_logits.iter().cloned().enumerate().collect();
indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
indexed.into_iter().take(5).collect::<Vec<_>>()
}
);
// Generate new tokens
for _ in 0..config.max_tokens {
// Sample next token (greedy when temperature=0, otherwise top-k)
let next_token = if config.temperature == 0.0 || config.top_k == 1 {
// Greedy decoding
current_logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.map_or(0, |(idx, _)| idx)
} else {
// Top-k sampling with temperature
Self::sample_topk_generate(&current_logits, config.temperature, config.top_k)
};
tokens.push(next_token);
// Check for stop tokens
if config.stop_tokens.contains(&next_token) {
break;
}
// F-REGR-231: Get logits for next iteration by processing the new token
current_logits = self.forward_refcell(next_token, &mut kv_cache)?;
}
Ok(tokens)
}
}