use crate::eval::task_evaluator::TaskEvalResult;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
/// Returns the short git commit hash for this process, resolved at most once.
///
/// Resolution order: the `ANNO_GIT_COMMIT` environment variable wins;
/// otherwise `git rev-parse --short HEAD` is invoked. Returns `None` when
/// neither source yields a commit. The result (including `None`) is cached
/// for the lifetime of the process.
fn cached_git_commit() -> Option<String> {
    static COMMIT: OnceLock<Option<String>> = OnceLock::new();
    COMMIT
        .get_or_init(|| {
            // Explicit override takes precedence over asking git.
            if let Ok(commit) = std::env::var("ANNO_GIT_COMMIT") {
                return Some(commit);
            }
            let output = std::process::Command::new("git")
                .args(["rev-parse", "--short", "HEAD"])
                .output()
                .ok()?;
            if !output.status.success() {
                return None;
            }
            let text = String::from_utf8(output.stdout).ok()?;
            Some(text.trim().to_string())
        })
        .clone()
}
/// One evaluation run as persisted to the JSONL log and the SQLite index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalHistoryEntry {
    /// RFC 3339 timestamp recorded when the entry was created.
    pub timestamp: String,
    /// Backend identifier the run was executed against.
    pub backend: String,
    /// Dataset name the run evaluated on.
    pub dataset: String,
    /// Task name (Debug-formatted from the task enum).
    pub task: String,
    /// RNG seed used for the run.
    pub seed: u64,
    /// F1 score, if the run produced one.
    pub f1: Option<f64>,
    /// Precision, if the run produced one.
    pub precision: Option<f64>,
    /// Recall, if the run produced one.
    pub recall: Option<f64>,
    /// Number of examples evaluated.
    pub n: usize,
    /// Wall-clock duration of the run in milliseconds, if measured.
    pub duration_ms: Option<f64>,
    /// Error message when the run failed.
    pub error: Option<String>,
    /// Full source result serialized as JSON, for later inspection
    /// (best-effort; `None` if serialization failed).
    pub metadata: Option<String>,
}
impl From<&TaskEvalResult> for EvalHistoryEntry {
    /// Flattens a `TaskEvalResult` into a history row, stamping it with the
    /// current UTC time and stashing the full result as JSON metadata.
    fn from(result: &TaskEvalResult) -> Self {
        // Pull a named metric out of the result's metric map, if present.
        let metric = |name: &str| result.metrics.get(name).copied();
        Self {
            timestamp: chrono::Utc::now().to_rfc3339(),
            backend: result.backend.clone(),
            dataset: result.dataset.name().to_string(),
            task: format!("{:?}", result.task),
            seed: result.seed,
            f1: metric("f1"),
            precision: metric("precision"),
            recall: metric("recall"),
            n: result.num_examples,
            duration_ms: result.duration_ms,
            error: result.error.clone(),
            // Best-effort: a serialization failure leaves metadata as None.
            metadata: serde_json::to_string(result).ok(),
        }
    }
}
/// Append-only evaluation history backed by a JSONL log plus an optional
/// SQLite index for fast queries. The JSONL file is the source of truth:
/// `rebuild_index` reconstructs the SQLite database from it.
pub struct EvalHistory {
    /// Path of the append-only JSONL log file.
    jsonl_path: PathBuf,
    /// Path of the SQLite index (sibling `eval-history.db`); `None` disables
    /// indexed queries and falls back to scanning the JSONL log.
    sqlite_path: Option<PathBuf>,
}
impl EvalHistory {
pub fn new(jsonl_path: impl AsRef<Path>) -> std::io::Result<Self> {
let jsonl_path = jsonl_path.as_ref().to_path_buf();
if let Some(parent) = jsonl_path.parent() {
std::fs::create_dir_all(parent)?;
}
let sqlite_path = jsonl_path
.parent()
.map(|p| p.join("eval-history.db"))
.or_else(|| Some(PathBuf::from("eval-history.db")));
if let Some(ref db_path) = sqlite_path {
Self::init_sqlite(db_path)?;
}
Ok(Self {
jsonl_path,
sqlite_path,
})
}
pub fn append_result(&self, result: &TaskEvalResult) -> std::io::Result<()> {
let entry = EvalHistoryEntry::from(result);
self.append_entry(&entry)
}
pub fn append_entry(&self, entry: &EvalHistoryEntry) -> std::io::Result<()> {
self.append_jsonl(entry)?;
if let Some(ref db_path) = self.sqlite_path {
self.insert_sqlite(entry, db_path)?;
}
Ok(())
}
fn append_jsonl(&self, entry: &EvalHistoryEntry) -> std::io::Result<()> {
let mut file = OpenOptions::new()
.create(true)
.append(true)
.open(&self.jsonl_path)?;
let line = serde_json::to_string(entry)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
writeln!(file, "{}", line)?;
Ok(())
}
pub fn load_all(&self) -> std::io::Result<Vec<EvalHistoryEntry>> {
if !self.jsonl_path.exists() {
return Ok(Vec::new());
}
let file = File::open(&self.jsonl_path)?;
let reader = BufReader::new(file);
let mut entries = Vec::new();
for line in reader.lines() {
let line = line?;
if line.trim().is_empty() {
continue;
}
if let Ok(entry) = serde_json::from_str::<EvalHistoryEntry>(&line) {
entries.push(entry);
}
}
Ok(entries)
}
pub fn backends(&self) -> std::io::Result<Vec<String>> {
let entries = self.load_all()?;
let backends: std::collections::HashSet<String> =
entries.iter().map(|e| e.backend.clone()).collect();
let mut result: Vec<String> = backends.into_iter().collect();
result.sort();
Ok(result)
}
pub fn datasets(&self) -> std::io::Result<Vec<String>> {
let entries = self.load_all()?;
let datasets: std::collections::HashSet<String> =
entries.iter().map(|e| e.dataset.clone()).collect();
let mut result: Vec<String> = datasets.into_iter().collect();
result.sort();
Ok(result)
}
pub fn stats(&self) -> std::io::Result<HistoryStats> {
let entries = self.load_all()?;
let mut by_backend: HashMap<String, usize> = HashMap::new();
let mut by_dataset: HashMap<String, usize> = HashMap::new();
let mut total_f1: f64 = 0.0;
let mut f1_count: usize = 0;
for entry in &entries {
*by_backend.entry(entry.backend.clone()).or_insert(0) += 1;
*by_dataset.entry(entry.dataset.clone()).or_insert(0) += 1;
if let Some(f1) = entry.f1 {
total_f1 += f1;
f1_count += 1;
}
}
Ok(HistoryStats {
total_entries: entries.len(),
by_backend,
by_dataset,
avg_f1: if f1_count > 0 {
Some(total_f1 / f1_count as f64)
} else {
None
},
})
}
fn init_sqlite(db_path: &Path) -> std::io::Result<()> {
use rusqlite::Connection;
let conn = Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
conn.execute(
"CREATE TABLE IF NOT EXISTS eval_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
backend TEXT NOT NULL,
dataset TEXT NOT NULL,
task TEXT NOT NULL,
seed INTEGER NOT NULL,
f1 REAL,
precision REAL,
recall REAL,
n INTEGER NOT NULL,
duration_ms REAL,
error TEXT,
metadata TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)",
[],
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_backend_dataset ON eval_results(backend, dataset)",
[],
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_timestamp ON eval_results(timestamp)",
[],
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
conn.execute("CREATE INDEX IF NOT EXISTS idx_f1 ON eval_results(f1)", [])
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_backend_timestamp ON eval_results(backend, timestamp)",
[],
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let _ = conn.execute("ALTER TABLE eval_results ADD COLUMN git_commit TEXT", []);
Ok(())
}
fn insert_sqlite(&self, entry: &EvalHistoryEntry, db_path: &Path) -> std::io::Result<()> {
use rusqlite::params;
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let git_commit = cached_git_commit();
conn.execute(
"INSERT INTO eval_results (
timestamp, backend, dataset, task, seed,
f1, precision, recall, n, duration_ms, error, metadata, git_commit
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)",
params![
entry.timestamp,
entry.backend,
entry.dataset,
entry.task,
entry.seed,
entry.f1,
entry.precision,
entry.recall,
entry.n,
entry.duration_ms,
entry.error,
entry.metadata,
git_commit,
],
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
Ok(())
}
pub fn query_recent(
&self,
backend: &str,
limit: usize,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
if let Some(ref db_path) = self.sqlite_path {
return Self::query_recent_sqlite(db_path, backend, limit);
}
let mut entries = self.load_all()?;
entries.retain(|e| e.backend == backend);
entries.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
entries.truncate(limit);
Ok(entries)
}
fn query_recent_sqlite(
db_path: &Path,
backend: &str,
limit: usize,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
use rusqlite::params;
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE backend = ?1
ORDER BY timestamp DESC
LIMIT ?2",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![backend, limit as i64], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let mut entries = Vec::new();
for row in rows {
entries.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
Ok(entries)
}
pub fn query_best(
&self,
backend: &str,
dataset: Option<&str>,
limit: usize,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
if let Some(ref db_path) = self.sqlite_path {
return Self::query_best_sqlite(db_path, backend, dataset, limit);
}
let mut entries = self.load_all()?;
entries.retain(|e| {
e.backend == backend
&& match dataset {
None => true,
Some(d) => e.dataset == d,
}
});
entries.sort_by(|a, b| {
let a_f1 = a.f1.unwrap_or(0.0);
let b_f1 = b.f1.unwrap_or(0.0);
b_f1.partial_cmp(&a_f1).unwrap_or(std::cmp::Ordering::Equal)
});
entries.truncate(limit);
Ok(entries)
}
fn query_best_sqlite(
db_path: &Path,
backend: &str,
dataset: Option<&str>,
limit: usize,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
use rusqlite::params;
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let mut entries = Vec::new();
if let Some(ds) = dataset {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE backend = ?1 AND dataset = ?2 AND f1 IS NOT NULL
ORDER BY f1 DESC
LIMIT ?3",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![backend, ds, limit as i64], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
} else {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE backend = ?1 AND f1 IS NOT NULL
ORDER BY f1 DESC
LIMIT ?2",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![backend, limit as i64], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
}
Ok(entries)
}
pub fn query_by_date_range(
&self,
start_date: &str,
end_date: &str,
backend: Option<&str>,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
if let Some(ref db_path) = self.sqlite_path {
return Self::query_by_date_range_sqlite(db_path, start_date, end_date, backend);
}
let mut entries = self.load_all()?;
entries.retain(|e| {
e.timestamp.as_str() >= start_date
&& e.timestamp.as_str() <= end_date
&& match backend {
None => true,
Some(b) => e.backend == b,
}
});
entries.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
Ok(entries)
}
fn query_by_date_range_sqlite(
db_path: &Path,
start_date: &str,
end_date: &str,
backend: Option<&str>,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
use rusqlite::params;
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let mut entries = Vec::new();
if let Some(b) = backend {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE timestamp >= ?1 AND timestamp <= ?2 AND backend = ?3
ORDER BY timestamp DESC",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![start_date, end_date, b], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
} else {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE timestamp >= ?1 AND timestamp <= ?2
ORDER BY timestamp DESC",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![start_date, end_date], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
}
Ok(entries)
}
pub fn compare_backends(
&self,
backend1: &str,
backend2: &str,
dataset: Option<&str>,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
if let Some(ref db_path) = self.sqlite_path {
return Self::compare_backends_sqlite(db_path, backend1, backend2, dataset);
}
let mut entries = self.load_all()?;
entries.retain(|e| {
(e.backend == backend1 || e.backend == backend2)
&& match dataset {
None => true,
Some(d) => e.dataset == d,
}
});
entries.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
Ok(entries)
}
fn compare_backends_sqlite(
db_path: &Path,
backend1: &str,
backend2: &str,
dataset: Option<&str>,
) -> std::io::Result<Vec<EvalHistoryEntry>> {
use rusqlite::params;
let conn = rusqlite::Connection::open(db_path)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let mut entries = Vec::new();
if let Some(ds) = dataset {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE (backend = ?1 OR backend = ?2) AND dataset = ?3
ORDER BY timestamp DESC",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![backend1, backend2, ds], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
} else {
let mut stmt = conn
.prepare(
"SELECT timestamp, backend, dataset, task, seed, f1, precision, recall, n, duration_ms, error, metadata
FROM eval_results
WHERE backend = ?1 OR backend = ?2
ORDER BY timestamp DESC",
)
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
let rows = stmt
.query_map(params![backend1, backend2], |row| {
Ok(EvalHistoryEntry {
timestamp: row.get(0)?,
backend: row.get(1)?,
dataset: row.get(2)?,
task: row.get(3)?,
seed: row.get(4)?,
f1: row.get(5)?,
precision: row.get(6)?,
recall: row.get(7)?,
n: row.get(8)?,
duration_ms: row.get(9)?,
error: row.get(10)?,
metadata: row.get(11)?,
})
})
.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?;
for row in rows {
entries
.push(row.map_err(|e| std::io::Error::other(format!("SQLite error: {}", e)))?);
}
}
Ok(entries)
}
pub fn cell_observation_counts(&self) -> std::io::Result<HashMap<(String, String), u64>> {
if let Some(ref db_path) = self.sqlite_path {
if let Ok(conn) = rusqlite::Connection::open(db_path) {
let mut stmt = conn
.prepare(
"SELECT backend, dataset, COUNT(*) FROM eval_results GROUP BY backend, dataset",
)
.map_err(|e| std::io::Error::other(e.to_string()))?;
let mut counts = HashMap::new();
let rows = stmt
.query_map([], |row| {
let backend: String = row.get(0)?;
let dataset: String = row.get(1)?;
let count: u64 = row.get(2)?;
Ok((backend, dataset, count))
})
.map_err(|e| std::io::Error::other(e.to_string()))?;
for (b, d, c) in rows.flatten() {
counts.insert((b, d), c);
}
return Ok(counts);
}
}
let entries = self.load_all()?;
let mut counts = HashMap::new();
for e in entries {
*counts
.entry((e.backend.clone(), e.dataset.clone()))
.or_insert(0u64) += 1;
}
Ok(counts)
}
pub fn dataset_observation_counts(&self) -> std::io::Result<HashMap<String, u64>> {
if let Some(ref db_path) = self.sqlite_path {
if let Ok(conn) = rusqlite::Connection::open(db_path) {
let mut stmt = conn
.prepare("SELECT dataset, COUNT(*) FROM eval_results GROUP BY dataset")
.map_err(|e| std::io::Error::other(e.to_string()))?;
let mut counts = HashMap::new();
let rows = stmt
.query_map([], |row| {
let dataset: String = row.get(0)?;
let count: u64 = row.get(1)?;
Ok((dataset, count))
})
.map_err(|e| std::io::Error::other(e.to_string()))?;
for (d, c) in rows.flatten() {
counts.insert(d, c);
}
return Ok(counts);
}
}
let entries = self.load_all()?;
let mut counts = HashMap::new();
for e in entries {
*counts.entry(e.dataset.clone()).or_insert(0u64) += 1;
}
Ok(counts)
}
pub fn backend_observation_counts(&self) -> std::io::Result<HashMap<String, u64>> {
if let Some(ref db_path) = self.sqlite_path {
if let Ok(conn) = rusqlite::Connection::open(db_path) {
let mut stmt = conn
.prepare("SELECT backend, COUNT(*) FROM eval_results GROUP BY backend")
.map_err(|e| std::io::Error::other(e.to_string()))?;
let mut counts = HashMap::new();
let rows = stmt
.query_map([], |row| {
let backend: String = row.get(0)?;
let count: u64 = row.get(1)?;
Ok((backend, count))
})
.map_err(|e| std::io::Error::other(e.to_string()))?;
for (b, c) in rows.flatten() {
counts.insert(b, c);
}
return Ok(counts);
}
}
let entries = self.load_all()?;
let mut counts = HashMap::new();
for e in entries {
*counts.entry(e.backend.clone()).or_insert(0u64) += 1;
}
Ok(counts)
}
pub fn detect_regressions(
&self,
min_observations: u64,
min_drop: f64,
) -> std::io::Result<Vec<RegressionAlert>> {
let Some(ref db_path) = self.sqlite_path else {
return Ok(Vec::new());
};
let conn = rusqlite::Connection::open(db_path).map_err(std::io::Error::other)?;
let mut stmt = conn
.prepare(
"SELECT backend, dataset, timestamp, f1 FROM eval_results \
WHERE f1 IS NOT NULL AND error IS NULL \
ORDER BY backend, dataset, timestamp",
)
.map_err(std::io::Error::other)?;
let mut cells: HashMap<(String, String), Vec<(String, f64)>> = HashMap::new();
let rows = stmt
.query_map([], |row| {
let b: String = row.get(0)?;
let d: String = row.get(1)?;
let ts: String = row.get(2)?;
let f1: f64 = row.get(3)?;
Ok((b, d, ts, f1))
})
.map_err(std::io::Error::other)?;
for row in rows.flatten() {
let (b, d, ts, f1) = row;
cells.entry((b, d)).or_default().push((ts, f1));
}
let mut alerts = Vec::new();
for ((backend, dataset), obs) in &cells {
let n = obs.len() as u64;
if n < min_observations {
continue;
}
let mid = obs.len() / 2;
let old_vals: Vec<f64> = obs[..mid].iter().map(|(_, f)| *f).collect();
let new_vals: Vec<f64> = obs[mid..].iter().map(|(_, f)| *f).collect();
if old_vals.is_empty() || new_vals.is_empty() {
continue;
}
let old_mean = old_vals.iter().sum::<f64>() / old_vals.len() as f64;
let new_mean = new_vals.iter().sum::<f64>() / new_vals.len() as f64;
let drop = old_mean - new_mean;
if drop >= min_drop {
alerts.push(RegressionAlert {
backend: backend.clone(),
dataset: dataset.clone(),
old_mean,
new_mean,
drop,
n_old: old_vals.len() as u64,
n_new: new_vals.len() as u64,
split_timestamp: obs[mid].0.clone(),
});
}
}
alerts.sort_by(|a, b| b.drop.total_cmp(&a.drop));
Ok(alerts)
}
pub fn detect_regressions_recent(
&self,
recent_n: usize,
min_effect_size: f64,
min_total: u64,
) -> std::io::Result<Vec<RegressionAlert>> {
let Some(ref db_path) = self.sqlite_path else {
return Ok(Vec::new());
};
let conn = rusqlite::Connection::open(db_path).map_err(std::io::Error::other)?;
let mut stmt = conn
.prepare(
"SELECT backend, dataset, timestamp, f1, n FROM eval_results \
WHERE f1 IS NOT NULL AND error IS NULL \
ORDER BY backend, dataset, timestamp",
)
.map_err(std::io::Error::other)?;
type CellKey = (String, String);
type CellRow = (String, f64, i64);
let mut cells: HashMap<CellKey, Vec<CellRow>> = HashMap::new();
let rows = stmt
.query_map([], |row| {
Ok((
row.get::<_, String>(0)?,
row.get::<_, String>(1)?,
row.get::<_, String>(2)?,
row.get::<_, f64>(3)?,
row.get::<_, i64>(4)?,
))
})
.map_err(std::io::Error::other)?;
for row in rows.flatten() {
cells
.entry((row.0, row.1))
.or_default()
.push((row.2, row.3, row.4));
}
let mut alerts = Vec::new();
for ((backend, dataset), obs) in &cells {
if (obs.len() as u64) < min_total || obs.len() <= recent_n {
continue;
}
let split = obs.len() - recent_n;
let recent_n_vals: Vec<i64> = obs[split..].iter().map(|(_, _, n)| *n).collect();
let median_recent_n = {
let mut sorted = recent_n_vals.clone();
sorted.sort();
sorted[sorted.len() / 2]
};
let old: Vec<f64> = obs[..split]
.iter()
.filter(|(_, _, n)| *n >= median_recent_n / 2 && *n <= median_recent_n * 2)
.map(|(_, f, _)| *f)
.collect();
let new: Vec<f64> = obs[split..].iter().map(|(_, f, _)| *f).collect();
if old.len() < 3 || new.is_empty() {
continue; }
let old_mean = old.iter().sum::<f64>() / old.len() as f64;
let new_mean = new.iter().sum::<f64>() / new.len() as f64;
let drop = old_mean - new_mean;
if drop <= 0.0 {
continue;
}
let old_var = old.iter().map(|x| (x - old_mean).powi(2)).sum::<f64>()
/ (old.len() as f64 - 1.0).max(1.0);
let new_var = new.iter().map(|x| (x - new_mean).powi(2)).sum::<f64>()
/ (new.len() as f64 - 1.0).max(1.0);
let pooled_sd = ((old_var + new_var) / 2.0).sqrt();
if pooled_sd < 1e-12 {
continue;
}
let d = drop / pooled_sd;
if d >= min_effect_size {
alerts.push(RegressionAlert {
backend: backend.clone(),
dataset: dataset.clone(),
old_mean,
new_mean,
drop,
n_old: old.len() as u64,
n_new: new.len() as u64,
split_timestamp: obs[split].0.clone(),
});
}
}
alerts.sort_by(|a, b| b.drop.total_cmp(&a.drop));
Ok(alerts)
}
pub fn detect_regressions_by_commit(
&self,
old_commit: &str,
new_commit: &str,
min_drop: f64,
) -> std::io::Result<Vec<RegressionAlert>> {
let Some(ref db_path) = self.sqlite_path else {
return Ok(Vec::new());
};
let conn = rusqlite::Connection::open(db_path).map_err(std::io::Error::other)?;
let has_column = conn
.prepare("SELECT git_commit FROM eval_results LIMIT 0")
.is_ok();
if !has_column {
return Ok(Vec::new());
}
let mut stmt = conn
.prepare(
"SELECT backend, dataset, git_commit, f1 FROM eval_results \
WHERE f1 IS NOT NULL AND error IS NULL \
AND git_commit IN (?1, ?2) \
ORDER BY backend, dataset",
)
.map_err(std::io::Error::other)?;
let mut old_cells: HashMap<(String, String), Vec<f64>> = HashMap::new();
let mut new_cells: HashMap<(String, String), Vec<f64>> = HashMap::new();
let rows = stmt
.query_map(rusqlite::params![old_commit, new_commit], |row| {
Ok((
row.get::<_, String>(0)?,
row.get::<_, String>(1)?,
row.get::<_, String>(2)?,
row.get::<_, f64>(3)?,
))
})
.map_err(std::io::Error::other)?;
for row in rows.flatten() {
let (b, d, commit, f1) = row;
if commit == old_commit {
old_cells.entry((b, d)).or_default().push(f1);
} else {
new_cells.entry((b, d)).or_default().push(f1);
}
}
let mut alerts = Vec::new();
for ((backend, dataset), old_vals) in &old_cells {
let Some(new_vals) = new_cells.get(&(backend.clone(), dataset.clone())) else {
continue;
};
if old_vals.is_empty() || new_vals.is_empty() {
continue;
}
let old_mean = old_vals.iter().sum::<f64>() / old_vals.len() as f64;
let new_mean = new_vals.iter().sum::<f64>() / new_vals.len() as f64;
let drop = old_mean - new_mean;
if drop >= min_drop {
alerts.push(RegressionAlert {
backend: backend.clone(),
dataset: dataset.clone(),
old_mean,
new_mean,
drop,
n_old: old_vals.len() as u64,
n_new: new_vals.len() as u64,
split_timestamp: format!("{} -> {}", old_commit, new_commit),
});
}
}
alerts.sort_by(|a, b| b.drop.total_cmp(&a.drop));
Ok(alerts)
}
pub fn rebuild_index(&self) -> std::io::Result<()> {
if let Some(ref db_path) = self.sqlite_path {
if db_path.exists() {
std::fs::remove_file(db_path)?;
}
Self::init_sqlite(db_path)?;
let entries = self.load_all()?;
for entry in &entries {
self.insert_sqlite(entry, db_path)?;
}
eprintln!(
"[history] Rebuilt SQLite index with {} entries",
entries.len()
);
}
Ok(())
}
}
/// One detected F1 regression for a (backend, dataset) cell.
#[derive(Debug, Clone)]
pub struct RegressionAlert {
    /// Backend whose scores regressed.
    pub backend: String,
    /// Dataset on which the regression was observed.
    pub dataset: String,
    /// Mean F1 over the baseline (older) observations.
    pub old_mean: f64,
    /// Mean F1 over the newer observations.
    pub new_mean: f64,
    /// `old_mean - new_mean` (positive = regression).
    pub drop: f64,
    /// Number of baseline observations.
    pub n_old: u64,
    /// Number of newer observations.
    pub n_new: u64,
    /// Timestamp at the old/new split — or, for commit-based detection,
    /// a `"old -> new"` commit-pair label.
    pub split_timestamp: String,
}
/// Aggregate statistics over the whole evaluation history.
#[derive(Debug, Clone)]
pub struct HistoryStats {
    /// Total number of logged entries.
    pub total_entries: usize,
    /// Entry count per backend name.
    pub by_backend: HashMap<String, usize>,
    /// Entry count per dataset name.
    pub by_dataset: HashMap<String, usize>,
    /// Mean F1 over entries that carry one; `None` when no entry has an F1.
    pub avg_f1: Option<f64>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // Round-trip: a single appended result comes back from the JSONL log
    // with its backend and F1 intact.
    #[test]
    fn test_append_and_load() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        let result = TaskEvalResult {
            task: crate::eval::task_mapping::Task::NER,
            dataset: crate::eval::loader::DatasetId::WikiGold,
            backend: "test-backend".to_string(),
            backend_display: None,
            seed: 42,
            success: true,
            error: None,
            metrics: {
                let mut m = HashMap::new();
                m.insert("f1".to_string(), 0.85);
                m.insert("precision".to_string(), 0.90);
                m.insert("recall".to_string(), 0.80);
                m
            },
            num_examples: 100,
            duration_ms: Some(5000.0),
            label_shift: None,
            robustness: None,
            stratified: None,
            confidence_intervals: None,
            kb_version: None,
        };
        history.append_result(&result).expect("append failed");
        let entries = history.load_all().expect("load failed");
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].backend, "test-backend");
        assert_eq!(entries[0].f1, Some(0.85));
    }

    // stats() counts entries per backend and averages F1 over five runs
    // split across two backends.
    #[test]
    fn test_stats() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        for i in 0..5 {
            let result = TaskEvalResult {
                task: crate::eval::task_mapping::Task::NER,
                dataset: crate::eval::loader::DatasetId::WikiGold,
                backend: format!("backend-{}", i % 2),
                backend_display: None,
                seed: 42,
                success: true,
                error: None,
                metrics: {
                    let mut m = HashMap::new();
                    m.insert("f1".to_string(), 0.8 + (i as f64 * 0.01));
                    m
                },
                num_examples: 100,
                duration_ms: Some(5000.0),
                label_shift: None,
                robustness: None,
                stratified: None,
                confidence_intervals: None,
                kb_version: None,
            };
            history.append_result(&result).expect("append failed");
        }
        let stats = history.stats().expect("stats failed");
        assert_eq!(stats.total_entries, 5);
        assert_eq!(stats.by_backend.len(), 2);
        assert!(stats.avg_f1.is_some());
    }

    // query_recent limits the result set and filters to one backend
    // (exercises the SQLite path, since new() always creates the index).
    #[test]
    fn test_query_recent() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        for i in 0..10 {
            let result = TaskEvalResult {
                task: crate::eval::task_mapping::Task::NER,
                dataset: crate::eval::loader::DatasetId::WikiGold,
                backend: if i % 2 == 0 {
                    "backend-a".to_string()
                } else {
                    "backend-b".to_string()
                },
                backend_display: None,
                seed: 42,
                success: true,
                error: None,
                metrics: {
                    let mut m = HashMap::new();
                    m.insert("f1".to_string(), 0.8);
                    m
                },
                num_examples: 100,
                duration_ms: Some(1000.0),
                label_shift: None,
                robustness: None,
                stratified: None,
                confidence_intervals: None,
                kb_version: None,
            };
            history.append_result(&result).expect("append failed");
        }
        let recent = history.query_recent("backend-a", 3).expect("query failed");
        assert_eq!(recent.len(), 3);
        assert!(recent.iter().all(|e| e.backend == "backend-a"));
    }

    // query_best returns entries ranked by F1, descending.
    #[test]
    fn test_query_best() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        for i in 0..5 {
            let result = TaskEvalResult {
                task: crate::eval::task_mapping::Task::NER,
                dataset: crate::eval::loader::DatasetId::WikiGold,
                backend: "test-backend".to_string(),
                backend_display: None,
                seed: 42,
                success: true,
                error: None,
                metrics: {
                    let mut m = HashMap::new();
                    m.insert("f1".to_string(), 0.5 + (i as f64 * 0.1));
                    m
                },
                num_examples: 100,
                duration_ms: Some(1000.0),
                label_shift: None,
                robustness: None,
                stratified: None,
                confidence_intervals: None,
                kb_version: None,
            };
            history.append_result(&result).expect("append failed");
        }
        let best = history
            .query_best("test-backend", None, 3)
            .expect("query failed");
        assert_eq!(best.len(), 3);
        assert!(best[0].f1.unwrap() > best[1].f1.unwrap());
        assert!(best[1].f1.unwrap() > best[2].f1.unwrap());
    }

    // backends()/datasets() return the distinct names seen in the log.
    #[test]
    fn test_backends_and_datasets() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        let backends = ["backend-a", "backend-b", "backend-c"];
        for backend in backends.iter() {
            let result = TaskEvalResult {
                task: crate::eval::task_mapping::Task::NER,
                dataset: crate::eval::loader::DatasetId::WikiGold,
                backend: backend.to_string(),
                backend_display: None,
                seed: 42,
                success: true,
                error: None,
                metrics: {
                    let mut m = HashMap::new();
                    m.insert("f1".to_string(), 0.8);
                    m
                },
                num_examples: 100,
                duration_ms: Some(1000.0),
                label_shift: None,
                robustness: None,
                stratified: None,
                confidence_intervals: None,
                kb_version: None,
            };
            history.append_result(&result).expect("append failed");
        }
        let backends_list = history.backends().expect("backends failed");
        assert_eq!(backends_list.len(), 3);
        assert!(backends_list.contains(&"backend-a".to_string()));
        let datasets_list = history.datasets().expect("datasets failed");
        assert!(!datasets_list.is_empty());
    }

    // rebuild_index drops and repopulates the SQLite database from the
    // JSONL log without losing entries.
    #[test]
    fn test_rebuild_index() {
        let temp = TempDir::new().expect("failed to create temp dir");
        let jsonl_path = temp.path().join("history.jsonl");
        let history = EvalHistory::new(&jsonl_path).expect("failed to create history");
        let result = TaskEvalResult {
            task: crate::eval::task_mapping::Task::NER,
            dataset: crate::eval::loader::DatasetId::WikiGold,
            backend: "test-backend".to_string(),
            backend_display: None,
            seed: 42,
            success: true,
            error: None,
            metrics: {
                let mut m = HashMap::new();
                m.insert("f1".to_string(), 0.85);
                m
            },
            num_examples: 100,
            duration_ms: Some(1000.0),
            label_shift: None,
            robustness: None,
            stratified: None,
            confidence_intervals: None,
            kb_version: None,
        };
        history.append_result(&result).expect("append failed");
        history.rebuild_index().expect("rebuild failed");
        let stats = history.stats().expect("stats failed");
        assert_eq!(stats.total_entries, 1);
    }
}