/// Encodes `text` into token ids for evaluation of the model at `model_path`.
///
/// The tokenizer returned by `AprV2Model::load_tokenizer` is tried first; if
/// it is unavailable or produces no tokens, `AprV2Model::encode_text` is used
/// as a fallback. An empty encoding from either source counts as a miss.
///
/// # Errors
///
/// Returns `CliError::ValidationFailed` when neither source yields at least
/// one token.
#[cfg(feature = "inference")]
fn tokenize_for_eval(model_path: &Path, text: &str) -> Result<Vec<u32>> {
    use realizar::apr::AprV2Model;

    AprV2Model::load_tokenizer(model_path)
        .map(|tokenizer| tokenizer.encode(text))
        .filter(|ids| !ids.is_empty())
        // The fallback is evaluated lazily, only when the first source missed.
        .or_else(|| AprV2Model::encode_text(model_path, text).filter(|ids| !ids.is_empty()))
        .ok_or_else(|| {
            CliError::ValidationFailed(
                "No tokenizer found. Place tokenizer.json next to model file.".to_string(),
            )
        })
}
/// Checks that `tokens` is long enough to form at least one
/// (input, target) pair.
///
/// # Errors
///
/// Returns `CliError::ValidationFailed` when fewer than two tokens are given.
fn validate_token_count(tokens: &[u32]) -> Result<()> {
    match tokens {
        // Zero or one token: no adjacent pair exists.
        [] | [_] => Err(CliError::ValidationFailed(
            "Need at least 2 tokens for perplexity calculation".to_string(),
        )),
        _ => Ok(()),
    }
}
/// Computes `(perplexity, cross_entropy)` of `tokens` under an APR transformer.
///
/// For every adjacent pair `(input, target)` in `tokens`, the input token is
/// run through `forward_with_cache` at its position and the natural-log
/// softmax probability of the target token is accumulated. Cross-entropy is
/// the mean negative log-probability over the scored targets; perplexity is
/// `exp(cross_entropy)`.
///
/// A target is skipped (not scored) when its id is not below `vocab_size` or
/// when the forward pass returned no logit for it. The latter is checked
/// explicitly so that a logits vector shorter than `vocab_size` cannot cause
/// an out-of-bounds panic.
///
/// # Errors
///
/// Returns `CliError::ValidationFailed` if a forward pass fails or if no
/// target token could be scored.
#[cfg(feature = "inference")]
fn calculate_apr_perplexity(
    transformer: &realizar::apr_transformer::AprTransformer,
    cache: &mut realizar::apr_transformer::AprKVCache,
    tokens: &[u32],
    vocab_size: usize,
) -> Result<(f32, f32)> {
    let mut total_log_prob = 0.0f64;
    let mut count = 0usize;

    for (pos, window) in tokens.windows(2).enumerate() {
        let input_token = window[0];
        let target_token = window[1];

        // The forward pass runs for every position, including those whose
        // target ends up skipped (presumably required to keep the KV cache
        // populated for later positions — confirm against realizar).
        let logits = transformer
            .forward_with_cache(input_token, cache, pos)
            .map_err(|e| CliError::ValidationFailed(format!("Forward pass failed: {e}")))?;

        // Checked lookup: the target must be inside the declared vocabulary
        // AND inside the logits actually returned.
        let target_idx = target_token as usize;
        let target_logit = match logits.get(target_idx) {
            Some(&logit) if target_idx < vocab_size => logit,
            _ => continue,
        };

        // Log-sum-exp with the maximum subtracted for numerical stability.
        let max_logit = logits.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        let log_sum_exp: f64 = logits
            .iter()
            .map(|&l| ((l - max_logit) as f64).exp())
            .sum::<f64>()
            .ln()
            + max_logit as f64;

        total_log_prob += target_logit as f64 - log_sum_exp;
        count += 1;
    }

    if count == 0 {
        return Err(CliError::ValidationFailed(
            "No valid tokens for perplexity calculation".to_string(),
        ));
    }

    let cross_entropy = (-total_log_prob / count as f64) as f32;
    let perplexity = cross_entropy.exp();
    Ok((perplexity, cross_entropy))
}
/// Computes `(perplexity, cross_entropy)` of `tokens` under a GGUF quantized model.
///
/// A fresh `OwnedQuantizedKVCache` is created for the run. For every adjacent
/// pair `(input, target)` in `tokens`, the input token is run through
/// `forward_single_with_cache` at its position and the natural-log softmax
/// probability of the target token is accumulated. Cross-entropy is the mean
/// negative log-probability over the scored targets; perplexity is
/// `exp(cross_entropy)`.
///
/// A target is skipped (not scored) when its id is not below the model's
/// configured `vocab_size` or when the forward pass returned no logit for it.
/// The latter is checked explicitly so that a logits vector shorter than the
/// configured vocabulary cannot cause an out-of-bounds panic.
///
/// # Errors
///
/// Returns `CliError::ValidationFailed` if a forward pass fails or if no
/// target token could be scored.
#[cfg(feature = "inference")]
fn calculate_gguf_perplexity(
    model: &realizar::gguf::OwnedQuantizedModel,
    tokens: &[u32],
) -> Result<(f32, f32)> {
    use realizar::gguf::OwnedQuantizedKVCache;

    let config = model.config();
    let vocab_size = config.vocab_size;
    let mut total_log_prob = 0.0f64;
    let mut count = 0usize;

    // Third argument is sized to the token count plus one (presumably the
    // maximum sequence length the cache must hold — confirm against realizar).
    let mut cache =
        OwnedQuantizedKVCache::new(config.num_layers, config.hidden_dim, tokens.len() + 1);

    for (pos, window) in tokens.windows(2).enumerate() {
        let input_token = window[0];
        let target_token = window[1];

        // The forward pass runs for every position, including those whose
        // target ends up skipped (presumably required to keep the KV cache
        // populated for later positions — confirm against realizar).
        let logits = model
            .forward_single_with_cache(input_token, &mut cache, pos)
            .map_err(|e| CliError::ValidationFailed(format!("Forward pass failed: {e}")))?;

        // Checked lookup: the target must be inside the configured vocabulary
        // AND inside the logits actually returned.
        let target_idx = target_token as usize;
        let target_logit = match logits.get(target_idx) {
            Some(&logit) if target_idx < vocab_size => logit,
            _ => continue,
        };

        // Log-sum-exp with the maximum subtracted for numerical stability.
        let max_logit = logits.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        let log_sum_exp: f64 = logits
            .iter()
            .map(|&l| ((l - max_logit) as f64).exp())
            .sum::<f64>()
            .ln()
            + max_logit as f64;

        total_log_prob += target_logit as f64 - log_sum_exp;
        count += 1;
    }

    if count == 0 {
        return Err(CliError::ValidationFailed(
            "No valid tokens for perplexity calculation".to_string(),
        ));
    }

    let cross_entropy = (-total_log_prob / count as f64) as f32;
    let perplexity = cross_entropy.exp();
    Ok((perplexity, cross_entropy))
}
/// Prints the evaluation report for `result` to stdout.
///
/// The report shows the perplexity with a PASS/FAIL verdict against the
/// threshold, followed by cross-entropy, the number of tokens evaluated, the
/// elapsed time, and a coarse quality label derived from the perplexity.
fn print_results(result: &EvalResult) {
    output::section("Results");
    println!();

    // Pick the colour and verdict text once, then emit a single headline.
    let ppl_text = format!("{:.2}", result.perplexity);
    let (ppl_styled, verdict) = if result.passed {
        (
            ppl_text.green().bold(),
            format!("(PASS: <= {:.1})", result.threshold).green(),
        )
    } else {
        (
            ppl_text.red().bold(),
            format!("(FAIL: > {:.1})", result.threshold).red(),
        )
    };
    println!("{} {} {}", "Perplexity:".white().bold(), ppl_styled, verdict);
    println!();

    output::kv("Cross-entropy", format!("{:.4}", result.cross_entropy));
    output::kv("Tokens evaluated", result.tokens_evaluated);
    output::kv("Eval time", format!("{:.2}s", result.eval_time_secs));
    println!();

    // Quality bands are checked in ascending order; anything that matches no
    // band (including a NaN perplexity) falls through to the last arm.
    let quality = match result.perplexity {
        p if p < 10.0 => "Excellent (competitive with SotA)".green(),
        p if p < 15.0 => "Good (usable quality)".green(),
        p if p < 20.0 => "Acceptable (minimum threshold)".yellow(),
        p if p < 50.0 => "Poor (likely undertrained)".red(),
        _ => "Garbage (model broken)".red().bold(),
    };
    output::kv("Quality", quality);
}
/// Built-in sample text: a short encyclopedic passage about the Eiffel Tower.
///
/// The literal begins and ends with a newline.
const SAMPLE_WIKITEXT: &str = r"
The tower is 324 metres tall, about the same height as an 81-storey building,
and the tallest structure in Paris. Its base is square, measuring 125 metres
on each side. During its construction, the Eiffel Tower surpassed the Washington
Monument to become the tallest man-made structure in the world, a title it held
for 41 years until the Chrysler Building in New York City was finished in 1930.
Due to the addition of a broadcasting aerial at the top of the tower in 1957,
it is now taller than the Chrysler Building by 5.2 metres. Excluding transmitters,
the Eiffel Tower is the second tallest free-standing structure in France after
the Millau Viaduct.
";
/// Built-in sample text: a short narrative passage.
///
/// The literal begins and ends with a newline.
const SAMPLE_LAMBADA: &str = r"
She walked into the room and saw her old friend sitting by the window. After
all these years, she finally understood why he had left. The answer was simple:
he had been afraid of what might happen if he stayed. But now, looking at him,
she realized that fear had cost them both dearly. The time they had lost could
never be recovered. All she could do was sit beside him and hope that somehow,
they could find a way to start again.
";
// Test module: compiled only for test builds, with its source loaded from
// `eval_tests.rs` via the `#[path]` attribute rather than the default
// `tests.rs` / `tests/mod.rs` lookup.
#[cfg(test)]
#[path = "eval_tests.rs"]
mod tests;