impl ValidatedEmbedding {
    /// Highest percentage of zero elements (as reported by `TensorStats::zero_pct`) accepted.
    const MAX_ZERO_PCT: f32 = 50.0;
    /// Smallest whole-tensor L2 norm (`TensorStats::l2_norm`) accepted.
    const MIN_L2_NORM: f32 = 1e-6;
    /// Smallest L2 norm accepted for a single spot-checked token row.
    const MIN_TOKEN_L2: f32 = 1e-6;
    /// Positions, as percentages of `vocab_size`, of the token rows that are spot-checked.
    const SPOT_CHECK_PCTS: [usize; 3] = [10, 50, 90];

    /// Validates `data` as a `vocab_size x hidden_dim` embedding table and wraps it.
    ///
    /// Checks run in this order and the first failure is returned:
    /// shape, zero density, NaN, Inf, whole-tensor L2 norm, constant tensor,
    /// and finally per-token L2 norm at the `SPOT_CHECK_PCTS` positions.
    ///
    /// # Errors
    ///
    /// Returns a [`ContractValidationError`] with `tensor_name` `"embedding"` and:
    /// - `F-LAYOUT-CONTRACT-001` if `data.len() != vocab_size * hidden_dim`, or if that
    ///   product does not fit in `usize`;
    /// - `F-DATA-QUALITY-001` if more than `MAX_ZERO_PCT` percent of the values are zero;
    /// - `F-DATA-QUALITY-002` if any value is NaN or infinite;
    /// - `F-DATA-QUALITY-003` if the L2 norm is below `MIN_L2_NORM` or `max - min` is ~0;
    /// - `F-DATA-QUALITY-004` if a spot-checked token row has L2 norm below `MIN_TOKEN_L2`.
    pub fn new(
        data: Vec<f32>,
        vocab_size: usize,
        hidden_dim: usize,
    ) -> std::result::Result<Self, ContractValidationError> {
        let name = "embedding";
        // Single place that builds the error so every check reports the same tensor name.
        let fail = |rule_id: &str, message: String| ContractValidationError {
            tensor_name: name.to_string(),
            rule_id: rule_id.to_string(),
            message,
        };

        // A plain `*` would panic in debug builds and silently wrap in release builds;
        // a wrapped product could even match `data.len()` by accident.
        let expected_len = match vocab_size.checked_mul(hidden_dim) {
            Some(len) => len,
            None => {
                return Err(fail(
                    "F-LAYOUT-CONTRACT-001",
                    format!(
                        "Shape mismatch: got {} elements, expected {}x{} which overflows usize",
                        data.len(),
                        vocab_size,
                        hidden_dim
                    ),
                ));
            }
        };
        if data.len() != expected_len {
            return Err(fail(
                "F-LAYOUT-CONTRACT-001",
                format!(
                    "Shape mismatch: got {} elements, expected {} ({}x{})",
                    data.len(),
                    expected_len,
                    vocab_size,
                    hidden_dim
                ),
            ));
        }

        let stats = TensorStats::compute(&data);

        let zero_pct = stats.zero_pct();
        if zero_pct > Self::MAX_ZERO_PCT {
            return Err(fail(
                "F-DATA-QUALITY-001",
                format!(
                    "DENSITY FAILURE: {:.1}% zeros (max {}%). Data likely loaded from wrong offset!",
                    zero_pct,
                    Self::MAX_ZERO_PCT
                ),
            ));
        }
        if stats.nan_count > 0 {
            return Err(fail(
                "F-DATA-QUALITY-002",
                format!("Contains {} NaN values", stats.nan_count),
            ));
        }
        if stats.inf_count > 0 {
            return Err(fail(
                "F-DATA-QUALITY-002",
                format!("Contains {} Inf values", stats.inf_count),
            ));
        }
        if stats.l2_norm < Self::MIN_L2_NORM {
            return Err(fail(
                "F-DATA-QUALITY-003",
                "L2 norm ~0: tensor is effectively empty".to_string(),
            ));
        }
        if (stats.max - stats.min).abs() < 1e-10 {
            return Err(fail(
                "F-DATA-QUALITY-003",
                "All values identical: tensor is constant".to_string(),
            ));
        }

        for pct in Self::SPOT_CHECK_PCTS {
            // Equal to `vocab_size * pct / 100`, but split so the intermediate
            // product cannot overflow for very large `vocab_size`.
            let token_id = vocab_size / 100 * pct + vocab_size % 100 * pct / 100;
            // Token `t` occupies `data[t * hidden_dim .. (t + 1) * hidden_dim]`.
            let start = token_id * hidden_dim;
            let end = start + hidden_dim;
            // `get` yields `None` when the row would run past the buffer
            // (e.g. `vocab_size == 0`), in which case the spot check is skipped.
            if let Some(row) = data.get(start..end) {
                let token_l2: f32 = row.iter().map(|x| x * x).sum::<f32>().sqrt();
                if token_l2 < Self::MIN_TOKEN_L2 {
                    return Err(fail(
                        "F-DATA-QUALITY-004",
                        format!(
                            "Token {} ({}% of vocab) has L2={:.2e}: embedding data likely corrupted or offset",
                            token_id, pct, token_l2
                        ),
                    ));
                }
            }
        }

        Ok(Self {
            data,
            vocab_size,
            hidden_dim,
            stats,
        })
    }

    /// Borrows the embedding values as a flat slice.
    #[must_use]
    pub fn data(&self) -> &[f32] {
        &self.data
    }

    /// Consumes the wrapper and returns the owned value buffer.
    #[must_use]
    pub fn into_inner(self) -> Vec<f32> {
        self.data
    }

    /// Returns the `vocab_size` given at construction.
    #[must_use]
    pub fn vocab_size(&self) -> usize {
        self.vocab_size
    }

    /// Returns the `hidden_dim` given at construction.
    #[must_use]
    pub fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }

    /// Returns the statistics computed over the data during validation.
    #[must_use]
    pub fn stats(&self) -> &TensorStats {
        &self.stats
    }
}
/// A named weight tensor stored as a flat `f32` buffer together with its
/// declared dimensions and the statistics computed over it.
///
/// All fields are private; `ValidatedWeight::new` only produces a value after
/// its shape and data-quality checks pass.
#[derive(Debug, Clone)]
pub struct ValidatedWeight {
// Flat element buffer; `new` requires its length to equal `out_dim * in_dim`.
data: Vec<f32>,
// First dimension passed to `new`.
out_dim: usize,
// Second dimension passed to `new`.
in_dim: usize,
// Tensor name; also used as `tensor_name` in validation errors.
name: String,
// Statistics produced by `TensorStats::compute` over `data`.
stats: TensorStats,
}
impl ValidatedWeight {
    /// Highest percentage of zero elements (as reported by `TensorStats::zero_pct`) accepted.
    const MAX_ZERO_PCT: f32 = 80.0;
    /// Smallest whole-tensor L2 norm (`TensorStats::l2_norm`) accepted.
    const MIN_L2_NORM: f32 = 1e-6;

    /// Validates `data` as an `out_dim x in_dim` weight tensor called `name` and wraps it.
    ///
    /// Checks run in this order and the first failure is returned:
    /// shape, zero density, NaN, Inf, whole-tensor L2 norm.
    ///
    /// # Errors
    ///
    /// Returns a [`ContractValidationError`] whose `tensor_name` is `name` and whose rule is:
    /// - `F-LAYOUT-CONTRACT-001` if `data.len() != out_dim * in_dim`, or if that product
    ///   does not fit in `usize`;
    /// - `F-DATA-QUALITY-001` if more than `MAX_ZERO_PCT` percent of the values are zero;
    /// - `F-DATA-QUALITY-002` if any value is NaN or infinite;
    /// - `F-DATA-QUALITY-003` if the L2 norm is below `MIN_L2_NORM`.
    pub fn new(
        data: Vec<f32>,
        out_dim: usize,
        in_dim: usize,
        name: &str,
    ) -> std::result::Result<Self, ContractValidationError> {
        // Single place that builds the error so every check reports the same tensor name.
        let fail = |rule_id: &str, message: String| ContractValidationError {
            tensor_name: name.to_string(),
            rule_id: rule_id.to_string(),
            message,
        };

        // A plain `*` would panic in debug builds and silently wrap in release builds;
        // a wrapped product could even match `data.len()` by accident.
        let expected_len = match out_dim.checked_mul(in_dim) {
            Some(len) => len,
            None => {
                return Err(fail(
                    "F-LAYOUT-CONTRACT-001",
                    format!(
                        "Shape mismatch: got {} elements, expected {}x{} which overflows usize",
                        data.len(),
                        out_dim,
                        in_dim
                    ),
                ));
            }
        };
        if data.len() != expected_len {
            return Err(fail(
                "F-LAYOUT-CONTRACT-001",
                format!(
                    "Shape mismatch: got {} elements, expected {} ({}x{})",
                    data.len(),
                    expected_len,
                    out_dim,
                    in_dim
                ),
            ));
        }

        let stats = TensorStats::compute(&data);

        let zero_pct = stats.zero_pct();
        if zero_pct > Self::MAX_ZERO_PCT {
            return Err(fail(
                "F-DATA-QUALITY-001",
                format!(
                    "DENSITY FAILURE: {:.1}% zeros (max {}%)",
                    zero_pct,
                    Self::MAX_ZERO_PCT
                ),
            ));
        }
        if stats.nan_count > 0 {
            return Err(fail(
                "F-DATA-QUALITY-002",
                format!("Contains {} NaN values", stats.nan_count),
            ));
        }
        if stats.inf_count > 0 {
            return Err(fail(
                "F-DATA-QUALITY-002",
                format!("Contains {} Inf values", stats.inf_count),
            ));
        }
        if stats.l2_norm < Self::MIN_L2_NORM {
            return Err(fail(
                "F-DATA-QUALITY-003",
                "L2 norm ~0: tensor is effectively empty".to_string(),
            ));
        }

        Ok(Self {
            data,
            out_dim,
            in_dim,
            name: name.to_string(),
            stats,
        })
    }

    /// Borrows the weight values as a flat slice.
    #[must_use]
    pub fn data(&self) -> &[f32] {
        &self.data
    }

    /// Consumes the wrapper and returns the owned value buffer.
    #[must_use]
    pub fn into_inner(self) -> Vec<f32> {
        self.data
    }

    /// Returns the `out_dim` given at construction.
    #[must_use]
    pub fn out_dim(&self) -> usize {
        self.out_dim
    }

    /// Returns the `in_dim` given at construction.
    #[must_use]
    pub fn in_dim(&self) -> usize {
        self.in_dim
    }

    /// Returns the tensor name given at construction.
    #[must_use]
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Returns the statistics computed over the data during validation.
    #[must_use]
    pub fn stats(&self) -> &TensorStats {
        &self.stats
    }
}
/// A named one-dimensional `f32` tensor together with the statistics computed
/// over it.
///
/// All fields are private; `ValidatedVector::new` only produces a value after
/// its length and NaN/Inf checks pass.
#[derive(Debug, Clone)]
pub struct ValidatedVector {
// Element buffer; `new` requires its length to equal the caller's `expected_len`.
data: Vec<f32>,
// Tensor name; also used as `tensor_name` in validation errors.
name: String,
// Statistics produced by `TensorStats::compute` over `data`.
stats: TensorStats,
}
impl ValidatedVector {
    /// Validates `data` as a vector of exactly `expected_len` elements called `name`.
    ///
    /// The length is checked first; statistics are then computed and the data is
    /// rejected if it contains NaN values, or failing that, infinite values.
    ///
    /// # Errors
    ///
    /// Returns a [`ContractValidationError`] whose `tensor_name` is `name` and whose rule is:
    /// - `F-LAYOUT-CONTRACT-003` if `data.len() != expected_len`;
    /// - `F-DATA-QUALITY-002` if any value is NaN or infinite.
    pub fn new(
        data: Vec<f32>,
        expected_len: usize,
        name: &str,
    ) -> std::result::Result<Self, ContractValidationError> {
        // Builds the error for a given rule; the tensor name is always `name`.
        let reject = |rule: &str, detail: String| ContractValidationError {
            tensor_name: name.to_string(),
            rule_id: rule.to_string(),
            message: detail,
        };

        let actual_len = data.len();
        if actual_len != expected_len {
            let detail = format!(
                "Length mismatch: got {}, expected {}",
                actual_len, expected_len
            );
            return Err(reject("F-LAYOUT-CONTRACT-003", detail));
        }

        let stats = TensorStats::compute(&data);

        // NaN is reported in preference to Inf; both share one rule id.
        let non_finite = if stats.nan_count > 0 {
            Some(format!("Contains {} NaN values", stats.nan_count))
        } else if stats.inf_count > 0 {
            Some(format!("Contains {} Inf values", stats.inf_count))
        } else {
            None
        };
        if let Some(detail) = non_finite {
            return Err(reject("F-DATA-QUALITY-002", detail));
        }

        let name = name.to_string();
        Ok(Self { data, name, stats })
    }

    /// Borrows the vector values as a slice.
    #[must_use]
    pub fn data(&self) -> &[f32] {
        self.data.as_slice()
    }

    /// Consumes the wrapper and returns the owned value buffer.
    #[must_use]
    pub fn into_inner(self) -> Vec<f32> {
        let Self { data, .. } = self;
        data
    }

    /// Returns the tensor name given at construction.
    #[must_use]
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Returns the statistics computed over the data during validation.
    #[must_use]
    pub fn stats(&self) -> &TensorStats {
        &self.stats
    }
}
use crate::apr_transformer::{AprTransformer, AprTransformerConfig};
/// Wrapper that owns an [`AprTransformer`] in a private field.
///
/// NOTE(review): the name suggests the inner transformer has passed validation,
/// but the constructor is not visible in this part of the file — confirm there.
#[derive(Debug, Clone)]
pub struct ValidatedAprTransformer {
// The wrapped transformer; private, so it cannot be reached from outside this module.
inner: AprTransformer,
}