use std::fmt;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::process::exit;
use clap::{Parser, Subcommand};
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
#[derive(Parser)]
#[command(
name = "llm-kernel-eval",
about = "Quality evaluation for llm-kernel modules"
)]
struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(long, default_value = "markdown", global = true)]
format: String,
#[arg(long, default_value = "eval/datasets", global = true)]
datasets_dir: PathBuf,
#[arg(long, global = true)]
baseline: Option<PathBuf>,
}
#[derive(Subcommand)]
enum Commands {
Tokens,
Safety,
Embedding,
Search,
#[cfg(feature = "graph")]
Graph,
All,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EvalReport {
module: String,
metrics: serde_json::Value,
passed: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EvalSummary {
results: Vec<EvalReport>,
}
impl fmt::Display for EvalSummary {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.results.is_empty() {
writeln!(f, "No evaluations run.")?;
return Ok(());
}
let total = self.results.len();
let passed = self.results.iter().filter(|r| r.passed).count();
writeln!(f, "┌─────────────────────────────────────────────┐")?;
writeln!(f, "│ llm-kernel-eval — Quality Report │")?;
writeln!(f, "├─────────────────────────────────────────────┤")?;
for report in &self.results {
let status = if report.passed { "PASS" } else { "FAIL" };
writeln!(f, "│ {} │ {}", pad(&report.module, 15), status)?;
}
writeln!(f, "├─────────────────────────────────────────────┤")?;
writeln!(
f,
"│ Total: {} / {} passed ",
passed, total
)?;
writeln!(f, "└─────────────────────────────────────────────┘")?;
for report in &self.results {
writeln!(f)?;
writeln!(
f,
"## {} [{}]",
report.module,
if report.passed { "PASS" } else { "FAIL" }
)?;
if let serde_json::Value::Object(map) = &report.metrics {
for (key, val) in map {
match val {
serde_json::Value::Number(n) => {
if let Some(fv) = n.as_f64() {
writeln!(f, " {key}: {fv:.4}")?;
} else {
writeln!(f, " {key}: {n}")?;
}
}
_ => writeln!(f, " {key}: {val}")?,
}
}
}
}
Ok(())
}
}
fn pad(s: &str, width: usize) -> String {
let mut out = s.to_string();
while out.chars().count() < width {
out.push(' ');
}
out.truncate(width);
out
}
fn load_jsonl<T: DeserializeOwned>(path: &Path) -> anyhow::Result<Vec<T>> {
let file = File::open(path)?;
let reader = BufReader::new(file);
let mut items = Vec::new();
for line in reader.lines() {
let line = line?;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
items.push(serde_json::from_str(trimmed)?);
}
Ok(items)
}
mod eval_tokens {
use super::*;
#[derive(serde::Deserialize)]
struct Entry {
text: String,
#[allow(dead_code)]
model: String,
actual_tokens: usize,
#[allow(dead_code)]
category: String,
}
pub fn run(datasets_dir: &Path) -> EvalReport {
let path = datasets_dir.join("tokens.jsonl");
let entries: Vec<Entry> = match load_jsonl(&path) {
Ok(e) => e,
Err(e) => {
return EvalReport {
module: "tokens".into(),
metrics: serde_json::json!({"error": format!("Failed to load {}: {e}", path.display())}),
passed: false,
};
}
};
if entries.is_empty() {
return EvalReport {
module: "tokens".into(),
metrics: serde_json::json!({"error": "empty dataset"}),
passed: false,
};
}
let mut total_abs_err = 0.0_f64;
let mut max_err = 0.0_f64;
let mut within_3 = 0usize;
let mut within_10pct = 0usize;
for entry in &entries {
let estimated = llm_kernel::tokens::estimate_tokens(&entry.text);
let err = (estimated as f64 - entry.actual_tokens as f64).abs();
total_abs_err += err;
max_err = max_err.max(err);
if err <= 3.0 {
within_3 += 1;
}
if entry.actual_tokens > 0 && err / entry.actual_tokens as f64 <= 0.10 {
within_10pct += 1;
}
}
let n = entries.len();
let mae = total_abs_err / n as f64;
EvalReport {
module: "tokens".into(),
metrics: serde_json::json!({
"entries": n,
"mae": mae,
"max_error": max_err,
"pct_within_3": within_3 as f64 / n as f64 * 100.0,
"pct_within_10pct": within_10pct as f64 / n as f64 * 100.0,
}),
passed: mae < 10.0,
}
}
}
mod eval_safety {
use super::*;
#[derive(serde::Deserialize)]
struct Entry {
input: String,
expected_masked: String,
#[allow(dead_code)]
pattern_type: String,
}
pub fn run(datasets_dir: &Path) -> EvalReport {
let path = datasets_dir.join("safety.jsonl");
let entries: Vec<Entry> = match load_jsonl(&path) {
Ok(e) => e,
Err(e) => {
return EvalReport {
module: "safety".into(),
metrics: serde_json::json!({"error": format!("Failed to load {}: {e}", path.display())}),
passed: false,
};
}
};
if entries.is_empty() {
return EvalReport {
module: "safety".into(),
metrics: serde_json::json!({"error": "empty dataset"}),
passed: false,
};
}
let mut exact_match = 0usize;
let mut total_precision = 0.0_f64;
let mut total_recall = 0.0_f64;
let mut missed = 0usize;
for entry in &entries {
let actual = llm_kernel::safety::sanitize::mask_secrets(&entry.input);
if actual == entry.expected_masked {
exact_match += 1;
}
let (tp, fp, fn_) = char_metrics(&actual, &entry.expected_masked);
let precision = if tp + fp > 0 {
tp as f64 / (tp + fp) as f64
} else {
1.0
};
let recall = if tp + fn_ > 0 {
tp as f64 / (tp + fn_) as f64
} else {
1.0
};
total_precision += precision;
total_recall += recall;
if fn_ > 0 {
missed += 1;
}
}
let n = entries.len();
let avg_p = total_precision / n as f64;
let avg_r = total_recall / n as f64;
let avg_f1 = if avg_p + avg_r > 0.0 {
2.0 * avg_p * avg_r / (avg_p + avg_r)
} else {
0.0
};
EvalReport {
module: "safety".into(),
metrics: serde_json::json!({
"entries": n,
"exact_match_rate": exact_match as f64 / n as f64,
"avg_precision": avg_p,
"avg_recall": avg_r,
"avg_f1": avg_f1,
"missed_secrets": missed,
}),
passed: avg_f1 >= 0.90,
}
}
fn char_metrics(actual: &str, expected: &str) -> (usize, usize, usize) {
let a: Vec<char> = actual.chars().collect();
let e: Vec<char> = expected.chars().collect();
let len = a.len().max(e.len());
let mut tp = 0usize;
let mut fp = 0usize;
let mut fn_ = 0usize;
for i in 0..len {
match (a.get(i), e.get(i)) {
(Some(&c1), Some(&c2)) if c1 == c2 => tp += 1,
(Some(_), _) => fp += 1,
(None, Some(_)) => fn_ += 1,
(None, None) => {}
}
}
(tp, fp, fn_)
}
}
mod eval_embedding {
use super::*;
pub fn run() -> EvalReport {
let mut identity_pass = 0usize;
let mut identity_total = 0usize;
let mut ortho_pass = 0usize;
let mut ortho_total = 0usize;
let mut symmetry_pass = 0usize;
let mut symmetry_total = 0usize;
let mut range_pass = 0usize;
let mut range_total = 0usize;
let test_vectors: Vec<Vec<f32>> = vec![
vec![1.0, 0.0, 0.0, 0.0],
vec![0.0, 1.0, 0.0, 0.0],
vec![1.0, 1.0, 1.0, 1.0],
vec![-1.0, 2.0, -3.0, 4.0],
vec![0.1, 0.2, 0.3, 0.4],
];
for v in &test_vectors {
let sim = llm_kernel::embedding::cosine_similarity(v, v);
identity_total += 1;
if (sim - 1.0).abs() < 1e-10 {
identity_pass += 1;
}
}
let ortho_pairs: [(Vec<f32>, Vec<f32>); 3] = [
(vec![1.0, 0.0], vec![0.0, 1.0]),
(vec![1.0, 0.0, 0.0], vec![0.0, 1.0, 0.0]),
(vec![1.0, 0.0, 0.0], vec![0.0, 0.0, 1.0]),
];
for (a, b) in &ortho_pairs {
let sim = llm_kernel::embedding::cosine_similarity(a, b);
ortho_total += 1;
if sim.abs() < 1e-10 {
ortho_pass += 1;
}
}
let asym_pairs: [(Vec<f32>, Vec<f32>); 4] = [
(vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]),
(vec![-1.0, 0.5, 2.0], vec![3.0, -1.0, 0.0]),
(vec![0.1; 128], vec![0.2; 128]),
(vec![1.0, 0.0, 0.0, 0.0, 0.0], vec![0.0, 1.0, 0.0, 0.0, 0.0]),
];
for (a, b) in &asym_pairs {
let sim_ab = llm_kernel::embedding::cosine_similarity(a, b);
let sim_ba = llm_kernel::embedding::cosine_similarity(b, a);
symmetry_total += 1;
if (sim_ab - sim_ba).abs() < 1e-10 {
symmetry_pass += 1;
}
}
let range_pairs: [(Vec<f32>, Vec<f32>); 5] = [
(vec![1.0, 0.0], vec![0.0, 1.0]),
(vec![1.0, 1.0], vec![1.0, 1.0]),
(vec![1.0, 0.0], vec![-1.0, 0.0]),
(vec![3.0, 4.0], vec![6.0, 8.0]),
(vec![1.0, 2.0, 3.0], vec![-1.0, -2.0, -3.0]),
];
for (a, b) in &range_pairs {
let sim = llm_kernel::embedding::cosine_similarity(a, b);
range_total += 1;
if (-1.0 - 1e-10..=1.0 + 1e-10).contains(&sim) {
range_pass += 1;
}
}
let all_total = identity_total + ortho_total + symmetry_total + range_total;
let all_pass = identity_pass + ortho_pass + symmetry_pass + range_pass;
EvalReport {
module: "embedding".into(),
metrics: serde_json::json!({
"identity_accuracy": identity_pass as f64 / identity_total as f64,
"orthogonality_accuracy": ortho_pass as f64 / ortho_total as f64,
"symmetry_accuracy": symmetry_pass as f64 / symmetry_total as f64,
"range_accuracy": range_pass as f64 / range_total as f64,
"total_checks": all_total,
"total_passed": all_pass,
}),
passed: all_pass == all_total,
}
}
}
mod eval_search {
use super::*;
#[derive(serde::Deserialize, Clone)]
struct SearchItem {
id: String,
score: f32,
#[allow(dead_code)]
text: String,
}
#[derive(serde::Deserialize)]
struct Entry {
#[allow(dead_code)]
scenario: String,
result_sets: Vec<Vec<SearchItem>>,
k: u32,
expected_top5_ids: Vec<String>,
}
impl From<SearchItem> for llm_kernel::search::SearchResult {
fn from(item: SearchItem) -> Self {
Self {
id: item.id,
score: item.score,
text: item.text,
}
}
}
pub fn run(datasets_dir: &Path) -> EvalReport {
let path = datasets_dir.join("search.jsonl");
let entries: Vec<Entry> = match load_jsonl(&path) {
Ok(e) => e,
Err(e) => {
return EvalReport {
module: "search".into(),
metrics: serde_json::json!({"error": format!("Failed to load {}: {e}", path.display())}),
passed: false,
};
}
};
if entries.is_empty() {
return EvalReport {
module: "search".into(),
metrics: serde_json::json!({"error": "empty dataset"}),
passed: false,
};
}
let mut total_p5 = 0.0_f64;
let mut total_r5 = 0.0_f64;
let mut total_mrr = 0.0_f64;
for entry in &entries {
let sets: Vec<Vec<llm_kernel::search::SearchResult>> = entry
.result_sets
.iter()
.map(|rs| {
rs.iter()
.cloned()
.map(llm_kernel::search::SearchResult::from)
.collect()
})
.collect();
let fused = llm_kernel::search::rrf_fuse(&sets, entry.k);
let top5_ids: Vec<&str> = fused.iter().take(5).map(|r| r.id.as_str()).collect();
let relevant_in_top5 = top5_ids
.iter()
.filter(|id| entry.expected_top5_ids.iter().any(|e| e == **id))
.count();
total_p5 += relevant_in_top5 as f64 / top5_ids.len().max(1) as f64;
let recalled = entry
.expected_top5_ids
.iter()
.filter(|e| top5_ids.iter().any(|t| t == e))
.count();
total_r5 += recalled as f64 / entry.expected_top5_ids.len().max(1) as f64;
let mut mrr = 0.0_f64;
for (i, result) in fused.iter().enumerate() {
if entry.expected_top5_ids.contains(&result.id) {
mrr = 1.0 / (i as f64 + 1.0);
break;
}
}
total_mrr += mrr;
}
let n = entries.len();
let avg_p5 = total_p5 / n as f64;
let avg_r5 = total_r5 / n as f64;
let avg_mrr = total_mrr / n as f64;
EvalReport {
module: "search".into(),
metrics: serde_json::json!({
"entries": n,
"avg_precision_at_5": avg_p5,
"avg_recall_at_5": avg_r5,
"avg_mrr": avg_mrr,
}),
passed: avg_p5 >= 0.5 && avg_r5 >= 0.5,
}
}
}
#[cfg(feature = "graph")]
mod eval_graph {
use super::*;
use llm_kernel::graph::schema::init_graph_schema;
use llm_kernel::graph::search::search_nodes;
use llm_kernel::graph::store::{append_edge, upsert_node};
use llm_kernel::graph::types::GraphNode;
use rusqlite::Connection;
#[derive(serde::Deserialize)]
struct GraphEntry {
#[allow(dead_code)]
scenario: String,
nodes: Vec<NodeDef>,
edges: Vec<EdgeDef>,
query_type: String,
query_term: Option<String>,
expected_ids: Vec<String>,
}
#[derive(serde::Deserialize)]
struct NodeDef {
id: String,
node_type: String,
title: String,
body: String,
tags: Vec<String>,
projects: Vec<String>,
importance: f64,
created: String,
}
#[derive(serde::Deserialize)]
struct EdgeDef {
source: String,
target: String,
#[allow(dead_code)]
relation: String,
weight: f64,
}
fn mem_db() -> Connection {
let conn = Connection::open_in_memory().unwrap();
init_graph_schema(&conn).unwrap();
conn
}
fn seed_graph(conn: &Connection, entry: &GraphEntry) {
for n in &entry.nodes {
let node = GraphNode {
id: n.id.clone(),
node_type: n.node_type.clone(),
title: n.title.clone(),
body: n.body.clone(),
tags: n.tags.clone(),
projects: n.projects.clone(),
agents: vec![],
created: n.created.clone(),
updated: n.created.clone(),
importance: n.importance,
access_count: 0,
accessed_at: String::new(),
};
upsert_node(conn, &node).unwrap();
}
for e in &entry.edges {
let edge = llm_kernel::graph::types::GraphEdge {
id: format!("e-{}-{}", e.source, e.target),
source: e.source.clone(),
target: e.target.clone(),
relation: "related".into(),
weight: e.weight,
ts: "2025-01-01T00:00:00Z".into(),
};
append_edge(conn, &edge).unwrap();
}
}
pub fn run(datasets_dir: &Path) -> EvalReport {
let path = datasets_dir.join("graph.jsonl");
let entries: Vec<GraphEntry> = match load_jsonl(&path) {
Ok(e) => e,
Err(e) => {
return EvalReport {
module: "graph".into(),
metrics: serde_json::json!({"error": format!("Failed to load {}: {e}", path.display())}),
passed: false,
};
}
};
if entries.is_empty() {
return EvalReport {
module: "graph".into(),
metrics: serde_json::json!({"error": "empty dataset"}),
passed: false,
};
}
let mut total_precision = 0.0_f64;
let mut total_recall = 0.0_f64;
let mut total_f1 = 0.0_f64;
let mut scenarios_run = 0usize;
for entry in &entries {
let conn = mem_db();
seed_graph(&conn, entry);
let found_ids: Vec<String> = match entry.query_type.as_str() {
"fts_search" => {
let term = entry.query_term.as_deref().unwrap_or("");
search_nodes(&conn, term, 20)
.unwrap_or_default()
.into_iter()
.map(|n| n.id)
.collect()
}
_ => continue,
};
let relevant: std::collections::HashSet<&str> =
entry.expected_ids.iter().map(|s| s.as_str()).collect();
let found: std::collections::HashSet<&str> =
found_ids.iter().map(|s| s.as_str()).collect();
let tp = relevant.intersection(&found).count() as f64;
let precision = if found.is_empty() {
1.0
} else {
tp / found.len() as f64
};
let recall = if relevant.is_empty() {
1.0
} else {
tp / relevant.len() as f64
};
let f1 = if precision + recall > 0.0 {
2.0 * precision * recall / (precision + recall)
} else {
0.0
};
total_precision += precision;
total_recall += recall;
total_f1 += f1;
scenarios_run += 1;
}
if scenarios_run == 0 {
return EvalReport {
module: "graph".into(),
metrics: serde_json::json!({"error": "no supported scenarios"}),
passed: false,
};
}
let n = scenarios_run as f64;
EvalReport {
module: "graph".into(),
metrics: serde_json::json!({
"scenarios_run": scenarios_run,
"avg_precision": total_precision / n,
"avg_recall": total_recall / n,
"avg_f1": total_f1 / n,
}),
passed: total_f1 / n >= 0.80,
}
}
}
const HIGHER_IS_BETTER: &[&str] = &[
"pct_within_3",
"pct_within_10pct",
"exact_match_rate",
"avg_precision",
"avg_recall",
"avg_f1",
"avg_precision_at_5",
"avg_recall_at_5",
"avg_mrr",
"avg_recall_at_k",
"avg_filtered_precision",
"recall_2bit",
"recall_4bit",
"identity_accuracy",
"orthogonality_accuracy",
"symmetry_accuracy",
"range_accuracy",
];
const LOWER_IS_BETTER: &[&str] = &[
"mae",
"max_error",
"missed_secrets",
"degradation_2bit_vs_4bit",
];
fn compare_reports(current: &[EvalReport], baseline: &[EvalReport]) -> (Vec<String>, bool) {
let mut diffs = Vec::new();
let mut has_regression = false;
let base_metrics: std::collections::HashMap<&str, &serde_json::Map<String, serde_json::Value>> =
baseline
.iter()
.filter(|r| r.metrics.get("error").is_none())
.filter_map(|r| r.metrics.as_object().map(|m| (r.module.as_str(), m)))
.collect();
for report in current {
if report.metrics.get("error").is_some() {
continue;
}
let Some(base_obj) = base_metrics.get(report.module.as_str()) else {
continue;
};
let Some(cur_obj) = report.metrics.as_object() else {
continue;
};
for (key, cur_val) in cur_obj {
let Some(cur_f) = cur_val.as_f64() else {
continue;
};
let Some(base_val) = base_obj.get(key) else {
continue;
};
let Some(base_f) = base_val.as_f64() else {
continue;
};
let delta = cur_f - base_f;
if delta.abs() < 1e-10 {
continue;
}
let (arrow, is_regression) = if HIGHER_IS_BETTER.contains(&key.as_str()) {
if delta < 0.0 {
("↓", true)
} else {
("↑", false)
}
} else if LOWER_IS_BETTER.contains(&key.as_str()) {
if delta > 0.0 {
("↑ (worse)", true)
} else {
("↓ (better)", false)
}
} else {
("~", false)
};
if is_regression {
has_regression = true;
}
diffs.push(format!(
" {}.{}: {:.4} → {:.4} {arrow} ({:+.4}){}",
report.module,
key,
base_f,
cur_f,
delta,
if is_regression { " ⚠ REGRESSION" } else { "" },
));
}
}
(diffs, has_regression)
}
fn main() {
let cli = Cli::parse();
let mut reports = Vec::new();
let should_run = |cmd: &Commands| -> bool {
match &cli.command {
Commands::All => true,
c => std::mem::discriminant(c) == std::mem::discriminant(cmd),
}
};
if should_run(&Commands::Tokens) {
reports.push(eval_tokens::run(&cli.datasets_dir));
}
if should_run(&Commands::Safety) {
reports.push(eval_safety::run(&cli.datasets_dir));
}
if should_run(&Commands::Embedding) {
reports.push(eval_embedding::run());
}
if should_run(&Commands::Search) {
reports.push(eval_search::run(&cli.datasets_dir));
}
#[cfg(feature = "graph")]
if should_run(&Commands::Graph) {
reports.push(eval_graph::run(&cli.datasets_dir));
}
let summary = EvalSummary { results: reports };
let baseline: Option<EvalSummary> = cli.baseline.as_ref().and_then(|path| {
let data = std::fs::read_to_string(path).ok()?;
serde_json::from_str(&data).ok()
});
match cli.format.as_str() {
"json" => println!("{}", serde_json::to_string_pretty(&summary).unwrap()),
_ => print!("{summary}"),
}
if let Some(ref base) = baseline {
let (diffs, has_regression) = compare_reports(&summary.results, &base.results);
if diffs.is_empty() {
eprintln!("\n✅ No metric changes detected vs baseline.");
} else {
eprintln!("\n## Baseline Diff");
for line in &diffs {
eprintln!("{line}");
}
if has_regression {
eprintln!("\n❌ Regression detected — at least one metric worsened.");
exit(1);
} else {
eprintln!("\n✅ No regression — all changes are improvements or neutral.");
}
}
}
}