1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//! PMAT-701 Bug B: Q4K-native frozen-teacher path for `apr distill --backend cuda`.
//!
//! Contract: `contracts/cuda-q4k-frozen-teacher-v1.yaml`
//!
//! # What this fixes
//!
//! The legacy `CudaTrainerTeacher` (aprender-train-distill) dequantizes Q4K
//! teacher weights to F32 at GPU upload. A 7B Q4K teacher (4 GB on disk)
//! inflates to ~28 GB of F32 GPU memory, which trips the Linux OOM killer
//! once you add the student's F32 + grads + Adam + activations footprint.
//!
//! [`RealizarQ4KTeacher`] takes a different path: it wraps realizar's
//! existing inference-time `OwnedQuantizedModelCuda`, which keeps weights
//! in Q4K on the GPU and uses Q4K-native CUDA kernels for forward GEMM
//! (the same path validated by `apr run`). Teacher GPU footprint stays at
//! ~4 GB for the 7B model — within budget on Grace Blackwell GB10.
//!
//! # When this path is used
//!
//! [`run_cuda_backend`](super::distill::run_cuda_backend) inspects the
//! teacher .apr's tensor dtype histogram. Any presence of
//! [`TensorDType::Q4K`] or [`TensorDType::Q6K`] routes the teacher
//! provider here. F32/F16/BF16 teachers continue to use
//! `CudaTrainerTeacher` (the dequant path is harmless for those types
//! since they don't inflate).
//!
//! # Falsifier mapping
//!
//! - FT-Q4K-TEACHER-001: no `[PMAT-333] Dequantizing` log line in the
//! teacher load path (the dequant happens inside realizar's
//! `OwnedQuantizedModel::from_apr` only for non-quantized tensors,
//! not for Q4K blocks).
//! - FT-Q4K-TEACHER-002: peak GPU memory for 7B Q4K teacher <= 6 GB.
//! - FT-Q4K-TEACHER-003: forward parity with `apr run` (same kernel path).
//! - FT-Q4K-TEACHER-005: `apr distill --epochs 1` completes on GB10.
#![cfg(all(feature = "cuda", feature = "training", feature = "inference"))]
use std::path::Path;
use entrenar_common::{EntrenarError, Result};
use entrenar_distill::teacher_provider::TeacherLogitsProvider;
use realizar::apr::MappedAprModel;
use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda};
/// Teacher provider backed by realizar's CUDA inference path.
///
/// Constructed from a frozen-on-disk APR teacher (typically Q4K-quantized).
/// Holds an `OwnedQuantizedModelCuda` whose weights live on the GPU in their
/// native quantization format — no dequantization to F32 at upload, no
/// gradient/optimizer state.
///
/// # Memory budget
///
/// For a 7B Q4K teacher on Grace Blackwell GB10:
/// - On-disk: ~4 GB (Q4K super-blocks)
/// - GPU resident after `preload_weights_gpu()`: ~4 GB (same Q4K blocks)
/// - Forward activations per token: ~1-2 GB (transient)
///
/// vs. the legacy `CudaTrainerTeacher` which would consume ~28 GB F32.
pub struct RealizarQ4KTeacher {
cuda_model: OwnedQuantizedModelCuda,
vocab_size: usize,
}
impl RealizarQ4KTeacher {
/// Load a Q4K teacher from an APR checkpoint and stage it on the GPU.
///
/// # Errors
///
/// Returns `EntrenarError::Internal` if the APR file cannot be mapped,
/// the quantized model construction fails, or CUDA initialization fails.
/// Per `cuda-q4k-frozen-teacher-v1.yaml` falsifier FT-Q4K-TEACHER-005,
/// the load succeeds on Grace Blackwell GB10 for 7B Q4K teachers when
/// `cuda-unified-memory-allocator-v1.yaml` (Bug A) is also in effect.
pub fn from_apr_path(model_path: &Path) -> Result<Self> {
let mapped =
MappedAprModel::from_path(model_path).map_err(|e| EntrenarError::Internal {
message: format!(
"RealizarQ4KTeacher: MappedAprModel::from_path({}): {e}",
model_path.display()
),
})?;
let quantized =
OwnedQuantizedModel::from_apr(&mapped).map_err(|e| EntrenarError::Internal {
message: format!("RealizarQ4KTeacher: OwnedQuantizedModel::from_apr: {e}"),
})?;
let vocab_size = quantized.config().vocab_size;
let mut cuda_model =
OwnedQuantizedModelCuda::new(quantized, 0).map_err(|e| EntrenarError::Internal {
message: format!("RealizarQ4KTeacher: OwnedQuantizedModelCuda::new: {e}"),
})?;
// Pre-upload all Q4K weights to GPU. This is what makes the next
// forward pass fast; without it, weights stream lazily and per-batch
// latency dominates. Failure here is non-fatal — realizar falls back
// to on-demand upload.
match cuda_model.preload_weights_gpu() {
Ok(bytes) => {
eprintln!(
"[PMAT-701] RealizarQ4KTeacher: pre-uploaded {} MB to GPU (Q4K-native, no F32 dequant)",
bytes / (1024 * 1024)
);
}
Err(e) => {
eprintln!(
"[PMAT-701] RealizarQ4KTeacher: weight preload failed ({e}); falling back to on-demand upload"
);
}
}
Ok(Self {
cuda_model,
vocab_size,
})
}
}
impl TeacherLogitsProvider for RealizarQ4KTeacher {
fn vocab_size(&self) -> usize {
self.vocab_size
}
fn logits_for_batch(&mut self, input_ids: &[Vec<u32>]) -> Result<Vec<Vec<f32>>> {
let mut out = Vec::with_capacity(input_ids.len());
for ids in input_ids {
let logits =
self.cuda_model
.forward_cuda(ids)
.map_err(|e| EntrenarError::Internal {
message: format!("RealizarQ4KTeacher.forward_cuda: {e}"),
})?;
if logits.len() != self.vocab_size {
return Err(EntrenarError::Internal {
message: format!(
"RealizarQ4KTeacher: forward_cuda returned {} logits, expected {} \
(vocab mismatch — teacher config does not match the on-disk checkpoint)",
logits.len(),
self.vocab_size
),
});
}
out.push(logits);
}
Ok(out)
}
}