/// Load a quantized tensor from an APR model, trying each candidate name in
/// order. For `q4`/`q8` tensors without transposition, the raw quantized
/// bytes are kept as-is (GH-478: no F32 expansion); the transpose path
/// dequantizes to F32, transposes, and stores the result with qtype 0 (F32).
fn apr_load_quantized_tensor(
apr: &crate::apr::MappedAprModel,
data: &[u8],
data_offset: usize,
names: &[&str],
in_dim: usize,
out_dim: usize,
transpose: bool,
) -> Result<OwnedQuantizedTensor> {
use crate::apr::MappedAprModel;
use crate::gguf::types::{APR_TYPE_Q4, APR_TYPE_Q8};
let (tensor, found_name) = names
.iter()
.find_map(|name| apr.find_tensor(name).map(|t| (t, *name)))
.ok_or_else(|| RealizarError::FormatError {
reason: format!("APR: tensor not found (tried: {})", names.join(", ")),
})?;
let start = data_offset + tensor.offset as usize;
let end = start + tensor.size as usize;
if end > data.len() {
return Err(RealizarError::FormatError {
reason: format!("APR: tensor {found_name} extends past EOF"),
});
}
let raw = &data[start..end];
let dtype = tensor.dtype.as_str();
let num_elements = in_dim * out_dim;
match dtype {
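        // GH-478 fast path: keep the raw quantized bytes; no F32 expansion.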
"q8" if !transpose => Ok(OwnedQuantizedTensor {
data: raw.to_vec(),
in_dim,
out_dim,
qtype: APR_TYPE_Q8,
}),
"q4" if !transpose => Ok(OwnedQuantizedTensor {
data: raw.to_vec(),
in_dim,
out_dim,
qtype: APR_TYPE_Q4,
}),
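        // Legacy transpose path (Conv1D-style layouts): dequantize to F32,
        // transpose, and store the result tagged as F32.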
"q8" => {
let mut f32_data = crate::apr::dequant::dequantize_apr_q8(raw, num_elements);
f32_data = transpose_f32_matrix(&f32_data, in_dim, out_dim);
let f32_bytes: Vec<u8> = f32_data.iter().flat_map(|v| v.to_le_bytes()).collect();
Ok(OwnedQuantizedTensor {
data: f32_bytes,
in_dim,
out_dim,
qtype: 0,
})
},
"q4" => {
let mut f32_data = crate::apr::dequant::dequantize_apr_q4(raw, num_elements);
f32_data = transpose_f32_matrix(&f32_data, in_dim, out_dim);
let f32_bytes: Vec<u8> = f32_data.iter().flat_map(|v| v.to_le_bytes()).collect();
Ok(OwnedQuantizedTensor {
data: f32_bytes,
in_dim,
out_dim,
qtype: 0,
})
},
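        // Other dtypes: store raw bytes and map the dtype string to a qtype tag.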
_ => {
let qtype = MappedAprModel::dtype_to_qtype(dtype);
Ok(OwnedQuantizedTensor {
data: raw.to_vec(),
in_dim,
out_dim,
qtype,
})
},
}
}
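/// Load an F32 tensor from the APR data section, trying each candidate name
/// in order. Errors if no name matches or the tensor extends past EOF.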
fn apr_load_f32_tensor(
apr: &crate::apr::MappedAprModel,
data: &[u8],
data_offset: usize,
names: &[&str],
) -> Result<Vec<f32>> {
let (tensor, found_name) = names
.iter()
.find_map(|name| apr.find_tensor(name).map(|t| (t, *name)))
.ok_or_else(|| RealizarError::FormatError {
reason: format!("APR: tensor not found (tried: {})", names.join(", ")),
})?;
let start = data_offset + tensor.offset as usize;
let end = start + tensor.size as usize;
if end > data.len() {
return Err(RealizarError::FormatError {
reason: format!("APR: tensor {found_name} extends past EOF"),
});
}
Ok(data[start..end]
.chunks_exact(4)
.map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
.collect())
}
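/// Optionally load a tensor as F32, returning `None` if it is absent or
/// truncated. F16 data is widened to F32; any other dtype is read as
/// little-endian F32.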
fn apr_try_load_f32(
apr: &crate::apr::MappedAprModel,
data: &[u8],
data_offset: usize,
name: &str,
) -> Option<Vec<f32>> {
let tensor = apr.find_tensor(name)?;
let start = data_offset + tensor.offset as usize;
let end = start + tensor.size as usize;
if end > data.len() {
return None;
}
let raw = &data[start..end];
match tensor.dtype.as_str() {
"F16" => Some(
raw.chunks_exact(2)
.map(|c| half::f16::from_le_bytes([c[0], c[1]]).to_f32())
.collect(),
),
_ => Some(
raw.chunks_exact(4)
.map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
.collect(),
),
}
}
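/// Infer the vocabulary size: prefer explicit metadata, otherwise fall back
/// to the first dimension of the token-embedding tensor (0 if none is found).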
fn apr_infer_vocab_size(apr: &crate::apr::MappedAprModel) -> usize {
if let Some(v) = apr.metadata.vocab_size {
if v > 0 {
return v;
}
}
apr.tensors
.iter()
.find(|t| {
t.name.contains("embed_tokens")
|| t.name.contains("tok_embeddings")
|| t.name.contains("token_embd")
})
.and_then(|t| t.shape.first().copied())
.unwrap_or(0)
}
impl OwnedQuantizedModel {
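    /// Build an `OwnedQuantizedModel` from a memory-mapped APR file.
    ///
    /// Validates the configuration through the contract gate, infers
    /// `explicit_head_dim` from the layer-0 Q projection when it disagrees
    /// with `hidden_dim / num_heads`, then loads the token embedding, all
    /// transformer layers, the output norm, and the LM head.
    ///
    /// A minimal usage sketch, assuming a valid `.apr` file on disk:
    ///
    /// ```ignore
    /// let apr = crate::apr::MappedAprModel::from_path("model.apr")?;
    /// let model = OwnedQuantizedModel::from_apr(&apr)?;
    /// ```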
pub fn from_apr(apr: &crate::apr::MappedAprModel) -> Result<Self> {
let t0 = std::time::Instant::now();
let data = apr.data();
let data_offset = apr.data_offset() as usize;
let vocab_size = apr_infer_vocab_size(apr);
let validated = ValidatedModelConfig::from_apr(apr, vocab_size)?;
let _proof = crate::contract_gate::validate_model_load_basic(
validated.architecture(),
validated.num_layers(),
validated.hidden_dim(),
validated.num_heads(),
validated.num_kv_heads(),
validated.intermediate_dim(),
validated.vocab_size(),
)
.map_err(crate::contract_gate::gate_error)?;
let mut config = validated.into_inner();
let transpose = config.constraints.needs_transpose();
let hidden_dim = config.hidden_dim;
let num_layers = config.num_layers;
let intermediate_dim = config.intermediate_dim;
let q_tensor_name = "model.layers.0.self_attn.q_proj.weight";
let gguf_q_name = "blk.0.attn_q.weight";
        if let Some(q_tensor) =
            apr.find_tensor(q_tensor_name).or_else(|| apr.find_tensor(gguf_q_name))
        {
if q_tensor.shape.len() == 2 {
let q_out_dim = q_tensor.shape[0];
                let inferred_head_dim =
                    if config.num_heads > 0 { q_out_dim / config.num_heads } else { 0 };
                let default_head_dim =
                    if config.num_heads > 0 { hidden_dim / config.num_heads } else { 0 };
if inferred_head_dim > 0 && inferred_head_dim != default_head_dim {
config.explicit_head_dim = Some(inferred_head_dim);
}
}
}
let token_embedding =
Self::load_apr_token_embedding(apr, data, data_offset, vocab_size, hidden_dim)?;
let q_dim = config.q_dim();
let kv_dim = config.kv_dim();
let mut layers = Vec::with_capacity(num_layers);
for layer_idx in 0..num_layers {
layers.push(Self::load_apr_layer(
apr,
data,
data_offset,
layer_idx,
hidden_dim,
q_dim,
kv_dim,
intermediate_dim,
transpose,
)?);
}
let output_norm_weight =
apr_load_f32_tensor(apr, data, data_offset, &["model.norm.weight", "output_norm.weight"])?;
let output_norm_bias = apr_try_load_f32(apr, data, data_offset, "model.norm.bias");
let lm_head_weight = apr_load_quantized_tensor(
apr, data, data_offset,
&["lm_head.weight", "output.weight"],
hidden_dim, vocab_size, transpose,
)?;
let lm_head_bias = apr_try_load_f32(apr, data, data_offset, "lm_head.bias");
let position_embedding =
apr_try_load_f32(apr, data, data_offset, "model.position_embedding.weight");
let load_ms = t0.elapsed().as_secs_f64() * 1000.0;
eprintln!(
"[GH-175] OwnedQuantizedModel::from_apr: {} layers loaded in {:.1}ms",
num_layers, load_ms
);
Ok(Self {
config,
token_embedding,
position_embedding,
layers,
encoder_layers: vec![],
encoder_output_norm_weight: None,
encoder_output_norm_bias: None,
output_norm_weight,
output_norm_bias,
lm_head_weight,
lm_head_bias,
#[cfg(feature = "cuda")]
cuda_executor: None,
#[cfg(feature = "cuda")]
cuda_kernel_count: std::sync::atomic::AtomicU64::new(0),
#[cfg(feature = "cuda")]
cached_weight_names: std::sync::Mutex::new(std::collections::HashSet::new()),
})
}
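    /// Locate the token-embedding tensor by common naming conventions
    /// (`embed_tokens`, `tok_embeddings`, `token_embd`) and dequantize it
    /// to F32.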
fn load_apr_token_embedding(
apr: &crate::apr::MappedAprModel,
data: &[u8],
data_offset: usize,
vocab_size: usize,
hidden_dim: usize,
) -> Result<Vec<f32>> {
        let embed_tensor = apr
            .tensors
            .iter()
            .find(|t| {
                t.name.contains("embed_tokens")
                    || t.name.contains("tok_embeddings")
                    || t.name.contains("token_embd")
            })
            .ok_or_else(|| RealizarError::FormatError {
                reason: "APR: embedding tensor not found".to_string(),
            })?;
let embed_start = data_offset + embed_tensor.offset as usize;
let embed_end = embed_start + embed_tensor.size as usize;
if embed_end > data.len() {
return Err(RealizarError::FormatError {
reason: "APR: embedding tensor extends past EOF".to_string(),
});
}
let embed_data = &data[embed_start..embed_end];
dequantize_embedding(embed_data, embed_tensor.dtype.as_str(), vocab_size * hidden_dim)
}
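    /// Load one transformer layer, trying HuggingFace-style tensor names
    /// first and falling back to GGUF-style `blk.*` names. The gate and FFN
    /// norm weights are optional; Q/K/V biases are concatenated into one
    /// buffer only when all three are present.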
#[allow(clippy::too_many_arguments)]
fn load_apr_layer(
apr: &crate::apr::MappedAprModel,
data: &[u8],
data_offset: usize,
layer_idx: usize,
hidden_dim: usize,
q_dim: usize,
kv_dim: usize,
intermediate_dim: usize,
transpose: bool,
) -> Result<OwnedQuantizedLayer> {
let hf_q = format!("model.layers.{layer_idx}.self_attn.q_proj.weight");
let hf_k = format!("model.layers.{layer_idx}.self_attn.k_proj.weight");
let hf_v = format!("model.layers.{layer_idx}.self_attn.v_proj.weight");
let hf_o = format!("model.layers.{layer_idx}.self_attn.o_proj.weight");
let hf_gate = format!("model.layers.{layer_idx}.mlp.gate_proj.weight");
let hf_up = format!("model.layers.{layer_idx}.mlp.up_proj.weight");
let hf_down = format!("model.layers.{layer_idx}.mlp.down_proj.weight");
let hf_attn_norm = format!("model.layers.{layer_idx}.input_layernorm.weight");
let hf_ffn_norm = format!("model.layers.{layer_idx}.post_attention_layernorm.weight");
let gguf_q = format!("blk.{layer_idx}.attn_q.weight");
let gguf_k = format!("blk.{layer_idx}.attn_k.weight");
let gguf_v = format!("blk.{layer_idx}.attn_v.weight");
let gguf_o = format!("blk.{layer_idx}.attn_output.weight");
let gguf_gate = format!("blk.{layer_idx}.ffn_gate.weight");
let gguf_up = format!("blk.{layer_idx}.ffn_up.weight");
let gguf_down = format!("blk.{layer_idx}.ffn_down.weight");
let gguf_attn_norm = format!("blk.{layer_idx}.attn_norm.weight");
let gguf_ffn_norm = format!("blk.{layer_idx}.ffn_norm.weight");
        let q_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_q, &gguf_q], hidden_dim, q_dim, transpose,
        )?;
        let k_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_k, &gguf_k], hidden_dim, kv_dim, transpose,
        )?;
        let v_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_v, &gguf_v], hidden_dim, kv_dim, transpose,
        )?;
let qkv_weight = OwnedQKVWeights::Separate {
q: q_weight,
k: k_weight,
v: v_weight,
};
let hf_q_bias = format!("model.layers.{layer_idx}.self_attn.q_proj.bias");
let hf_k_bias = format!("model.layers.{layer_idx}.self_attn.k_proj.bias");
let hf_v_bias = format!("model.layers.{layer_idx}.self_attn.v_proj.bias");
let gguf_q_bias = format!("blk.{layer_idx}.attn_q.bias");
let gguf_k_bias = format!("blk.{layer_idx}.attn_k.bias");
let gguf_v_bias = format!("blk.{layer_idx}.attn_v.bias");
let qkv_bias = apr_try_load_f32(apr, data, data_offset, &hf_q_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_q_bias))
.and_then(|q_b| {
let k_b = apr_try_load_f32(apr, data, data_offset, &hf_k_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_k_bias))?;
let v_b = apr_try_load_f32(apr, data, data_offset, &hf_v_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_v_bias))?;
let mut combined = Vec::with_capacity(q_b.len() + k_b.len() + v_b.len());
combined.extend_from_slice(&q_b);
combined.extend_from_slice(&k_b);
combined.extend_from_slice(&v_b);
Some(combined)
});
        let o_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_o, &gguf_o], q_dim, hidden_dim, transpose,
        )?;
        let ffn_gate_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_gate, &gguf_gate], hidden_dim, intermediate_dim, transpose,
        )
        .ok();
        let ffn_up_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_up, &gguf_up], hidden_dim, intermediate_dim, transpose,
        )?;
        let ffn_down_weight = apr_load_quantized_tensor(
            apr, data, data_offset, &[&hf_down, &gguf_down], intermediate_dim, hidden_dim, transpose,
        )?;
        let attn_norm_weight =
            apr_load_f32_tensor(apr, data, data_offset, &[&hf_attn_norm, &gguf_attn_norm])?;
        let ffn_norm_weight =
            apr_load_f32_tensor(apr, data, data_offset, &[&hf_ffn_norm, &gguf_ffn_norm]).ok();
let hf_attn_norm_bias = format!("model.layers.{layer_idx}.input_layernorm.bias");
let hf_ffn_norm_bias = format!("model.layers.{layer_idx}.post_attention_layernorm.bias");
let hf_o_bias = format!("model.layers.{layer_idx}.self_attn.o_proj.bias");
let hf_up_bias = format!("model.layers.{layer_idx}.mlp.up_proj.bias");
let hf_down_bias = format!("model.layers.{layer_idx}.mlp.down_proj.bias");
let gguf_attn_norm_bias = format!("blk.{layer_idx}.attn_norm.bias");
let gguf_ffn_norm_bias = format!("blk.{layer_idx}.ffn_norm.bias");
let gguf_o_bias = format!("blk.{layer_idx}.attn_output.bias");
let gguf_up_bias = format!("blk.{layer_idx}.ffn_up.bias");
let gguf_down_bias = format!("blk.{layer_idx}.ffn_down.bias");
Ok(OwnedQuantizedLayer {
attn_norm_weight,
attn_norm_bias: apr_try_load_f32(apr, data, data_offset, &hf_attn_norm_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_attn_norm_bias)),
qkv_weight,
qkv_bias,
attn_output_weight: o_weight,
attn_output_bias: apr_try_load_f32(apr, data, data_offset, &hf_o_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_o_bias)),
ffn_norm_weight,
ffn_norm_bias: apr_try_load_f32(apr, data, data_offset, &hf_ffn_norm_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_ffn_norm_bias)),
ffn_gate_weight,
ffn_gate_bias: None,
ffn_up_weight,
ffn_up_bias: apr_try_load_f32(apr, data, data_offset, &hf_up_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_up_bias)),
ffn_down_weight,
ffn_down_bias: apr_try_load_f32(apr, data, data_offset, &hf_down_bias)
.or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_down_bias)),
attn_q_norm_weight: apr_try_load_f32(apr, data, data_offset,
&format!("model.layers.{layer_idx}.self_attn.q_norm.weight"))
.or_else(|| apr_try_load_f32(apr, data, data_offset,
&format!("blk.{layer_idx}.attn_q_norm.weight"))),
attn_k_norm_weight: apr_try_load_f32(apr, data, data_offset,
&format!("model.layers.{layer_idx}.self_attn.k_norm.weight"))
.or_else(|| apr_try_load_f32(apr, data, data_offset,
&format!("blk.{layer_idx}.attn_k_norm.weight"))),
})
}
}
#[cfg(all(test, not(target_arch = "wasm32")))]
mod gh478_per_layer_dequant_tests {
use crate::apr::{HEADER_SIZE, MAGIC, MappedAprModel};
use crate::gguf::types::{APR_TYPE_Q4, APR_TYPE_Q8};
use std::io::Write;
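    /// Build a minimal in-memory APR file containing a single tensor:
    /// header, 64-byte-padded JSON metadata, one tensor-index entry, and the
    /// raw payload.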
fn build_single_tensor_apr(name: &str, dtype_byte: u8, shape: &[u64], payload: &[u8]) -> Vec<u8> {
let metadata = b"{}";
let metadata_padded = metadata.len().div_ceil(64) * 64;
let mut entry = Vec::new();
entry.extend_from_slice(&(name.len() as u16).to_le_bytes());
entry.extend_from_slice(name.as_bytes());
entry.push(dtype_byte);
entry.push(shape.len() as u8);
for &d in shape {
entry.extend_from_slice(&d.to_le_bytes());
}
        entry.extend_from_slice(&0u64.to_le_bytes()); // offset within the data section
        entry.extend_from_slice(&(payload.len() as u64).to_le_bytes()); // size in bytes
let tensor_index_offset = (HEADER_SIZE + metadata_padded) as u64;
let data_offset = tensor_index_offset + entry.len() as u64;
let total = data_offset as usize + payload.len();
let mut out = vec![0u8; total];
out[0..4].copy_from_slice(&MAGIC);
        out[4] = 2; // format version
        out[5] = 0;
        out[8..12].copy_from_slice(&1u32.to_le_bytes()); // tensor count
        out[12..20].copy_from_slice(&(HEADER_SIZE as u64).to_le_bytes()); // metadata offset
        out[20..24].copy_from_slice(&(metadata.len() as u32).to_le_bytes()); // metadata length
        out[24..32].copy_from_slice(&tensor_index_offset.to_le_bytes()); // tensor index offset
out[32..40].copy_from_slice(&data_offset.to_le_bytes());
out[HEADER_SIZE..HEADER_SIZE + metadata.len()].copy_from_slice(metadata);
let idx = tensor_index_offset as usize;
out[idx..idx + entry.len()].copy_from_slice(&entry);
let data_start = data_offset as usize;
out[data_start..data_start + payload.len()].copy_from_slice(payload);
out
}
fn write_tempfile(bytes: &[u8]) -> tempfile::NamedTempFile {
let mut f = tempfile::NamedTempFile::new().expect("tempfile");
f.write_all(bytes).expect("write apr");
f
}
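    /// GH-478: q4 tensors must keep their raw quantized bytes at load time
    /// rather than being expanded to F32.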
#[test]
fn apr_q4_load_keeps_raw_bytes_not_f32_expansion() {
let in_dim = 32usize;
let out_dim = 4usize;
let num_elements = in_dim * out_dim;
        // 128 elements at 32 per block = 4 blocks, assuming 18 bytes per Q4
        // block (2-byte scale + 16 packed nibble bytes).
        let raw_q4 = vec![0u8; 4 * 18];
let file = write_tempfile(&build_single_tensor_apr(
"ffn_up.weight",
            128, // APR dtype byte for q4
            &[out_dim as u64, in_dim as u64],
&raw_q4,
));
let apr = MappedAprModel::from_path(file.path()).expect("load apr");
let tensor = super::apr_load_quantized_tensor(
&apr,
apr.data(),
apr.data_offset() as usize,
&["ffn_up.weight"],
in_dim,
out_dim,
            false, // transpose
        )
.expect("load tensor");
assert_eq!(tensor.data.len(), raw_q4.len(),
"APR q4 loaded tensor must keep raw quantized bytes (got {}, expected {})",
tensor.data.len(), raw_q4.len());
assert_ne!(tensor.data.len(), num_elements * 4,
"APR q4 loaded tensor must NOT be F32-expanded ({}B = 4×{})",
num_elements * 4, num_elements);
assert_eq!(tensor.qtype, APR_TYPE_Q4, "qtype must tag as APR_TYPE_Q4");
assert_eq!(tensor.in_dim, in_dim);
assert_eq!(tensor.out_dim, out_dim);
}
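    /// GH-478: same invariant for q8 tensors.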
#[test]
fn apr_q8_load_keeps_raw_bytes_not_f32_expansion() {
let in_dim = 32usize;
let out_dim = 4usize;
let num_elements = in_dim * out_dim;
        // Assuming a 4-byte scale header followed by one byte per element.
        let raw_q8 = vec![0u8; 4 + num_elements];
let file = write_tempfile(&build_single_tensor_apr(
"ffn_up.weight",
            129, // APR dtype byte for q8
            &[out_dim as u64, in_dim as u64],
&raw_q8,
));
let apr = MappedAprModel::from_path(file.path()).expect("load apr");
let tensor = super::apr_load_quantized_tensor(
&apr,
apr.data(),
apr.data_offset() as usize,
&["ffn_up.weight"],
in_dim,
out_dim,
false,
)
.expect("load tensor");
assert_eq!(tensor.data.len(), raw_q8.len(),
"APR q8 loaded tensor must keep raw quantized bytes");
assert_ne!(tensor.data.len(), num_elements * 4,
"APR q8 loaded tensor must NOT be F32-expanded");
assert_eq!(tensor.qtype, APR_TYPE_Q8, "qtype must tag as APR_TYPE_Q8");
}
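    /// The transpose (Conv1D) path intentionally keeps the legacy behavior:
    /// dequantize to F32, transpose, and tag the result as F32 (qtype 0).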
#[test]
fn apr_q4_conv1d_transpose_still_dequants_to_f32() {
let in_dim = 32usize;
let out_dim = 4usize;
let num_elements = in_dim * out_dim;
let raw_q4 = vec![0u8; 4 * 18];
let file = write_tempfile(&build_single_tensor_apr(
"ffn_up.weight",
128,
&[out_dim as u64, in_dim as u64],
&raw_q4,
));
let apr = MappedAprModel::from_path(file.path()).expect("load apr");
let tensor = super::apr_load_quantized_tensor(
&apr,
apr.data(),
apr.data_offset() as usize,
&["ffn_up.weight"],
in_dim,
out_dim,
            true, // transpose
        )
.expect("load tensor");
assert_eq!(tensor.data.len(), num_elements * 4,
"Conv1D (transpose=true) path keeps legacy F32 expansion");
assert_eq!(tensor.qtype, 0, "Conv1D path flattens qtype to F32");
}
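    /// End-to-end check against a real model (opt-in via the
    /// `GH478_APR_Q4_MODEL` env var): bytes stored for q4/q8 tensors must
    /// equal the on-disk raw quantized bytes.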
#[test]
#[ignore]
fn gh478_real_model_load_stays_bounded() {
let path = match std::env::var("GH478_APR_Q4_MODEL") {
Ok(p) => p,
            Err(_) => return, // skip unless a real model path is provided
        };
let apr = MappedAprModel::from_path(&path).expect("mmap apr");
let data = apr.data();
let data_offset = apr.data_offset() as usize;
let mut total_raw_bytes: u64 = 0;
let mut total_stored_bytes: u64 = 0;
let mut total_elements: u64 = 0;
let mut qtensor_count = 0usize;
for tensor in &apr.tensors {
let dtype = tensor.dtype.as_str();
if dtype != "q4" && dtype != "q8" {
continue;
}
if tensor.shape.len() != 2 {
                continue;
            }
let out_dim = tensor.shape[0] as usize;
let in_dim = tensor.shape[1] as usize;
let raw_size = tensor.size;
let expected_f32_size = (in_dim * out_dim * 4) as u64;
let loaded = super::apr_load_quantized_tensor(
&apr, data, data_offset, &[tensor.name.as_str()],
in_dim, out_dim, false,
).expect("load tensor");
total_raw_bytes += raw_size;
total_stored_bytes += loaded.data.len() as u64;
total_elements += (in_dim * out_dim) as u64;
qtensor_count += 1;
assert_eq!(loaded.data.len() as u64, raw_size,
"tensor {}: data.len()={} raw_size={} expected_f32={} — regression!",
tensor.name, loaded.data.len(), raw_size, expected_f32_size);
}
let stored_gb = total_stored_bytes as f64 / 1e9;
let would_be_f32_gb = (total_elements * 4) as f64 / 1e9;
eprintln!(
"[GH-478] {} q-tensors stored={:.3} GB would-be-F32={:.3} GB ratio={:.1}×",
qtensor_count, stored_gb, would_be_f32_gb, would_be_f32_gb / stored_gb
);
assert!(qtensor_count > 0, "no q4/q8 tensors found — wrong model?");
assert_eq!(total_stored_bytes, total_raw_bytes,
"total stored bytes must equal on-disk raw quant bytes");
assert!(would_be_f32_gb > stored_gb * 2.0,
"falsification sanity: F32 expansion must be ≥2× the stored size");
}
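    /// Resident set size in GiB, read from `/proc/self/status` (Linux only;
    /// returns 0.0 elsewhere or on parse failure).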
fn read_rss_gb() -> f64 {
let status = std::fs::read_to_string("/proc/self/status").unwrap_or_default();
for line in status.lines() {
if let Some(rest) = line.strip_prefix("VmRSS:") {
                let kb: f64 = rest.trim().trim_end_matches(" kB").parse().unwrap_or(0.0);
                return kb / 1_048_576.0; // kiB -> GiB
            }
}
0.0
}
}