use rusqlite::{params, Connection};
use crate::core::effort::effort_tshirt_from_size;
use crate::core::errors::{Result, TgaError};
/// Smallest corpus for which `compute_percentiles` produces thresholds;
/// smaller inputs yield `None`.
const MIN_CORPUS_SIZE: usize = 5;
/// Value bound to the `dataset` column whenever this module reads or writes
/// a row of `effort_percentile_thresholds`.
const DEFAULT_DATASET: &str = "default";
/// Score cut-offs (20th/40th/60th/80th percentile) used to sort effort scores
/// into five bands, together with the number of samples they were derived from.
#[derive(Debug, Clone, PartialEq)]
pub struct EffortPercentileThresholds {
    /// Exclusive upper bound of band 1.
    pub p20: f64,
    /// Exclusive upper bound of band 2.
    pub p40: f64,
    /// Exclusive upper bound of band 3.
    pub p60: f64,
    /// Exclusive upper bound of band 4; scores at or above it fall in band 5.
    pub p80: f64,
    /// Number of scores the thresholds were computed from.
    pub sample_count: usize,
}

impl EffortPercentileThresholds {
    /// Maps `score` to a band in `1..=5`.
    ///
    /// The bounds are checked in order (`p20`, `p40`, `p60`, `p80`) and the
    /// first one that `score` is strictly below decides the band. A score that
    /// is below none of them (including a NaN score, for which every comparison
    /// is false) lands in band 5.
    pub fn band_for_score(&self, score: f64) -> i64 {
        let upper_bounds = [self.p20, self.p40, self.p60, self.p80];
        upper_bounds
            .iter()
            .position(|&bound| score < bound)
            .map_or(5, |idx| idx as i64 + 1)
    }
}
/// Reads the stored percentile thresholds for the default dataset.
///
/// # Returns
/// * `Ok(Some(_))` when a row exists for `DEFAULT_DATASET`.
/// * `Ok(None)` when the query returns no rows.
///
/// # Errors
/// Any other `rusqlite` failure is converted into a `TgaError`.
pub fn load_thresholds(conn: &Connection) -> Result<Option<EffortPercentileThresholds>> {
    /// Builds the thresholds struct from the selected columns, in SELECT order.
    fn row_to_thresholds(row: &rusqlite::Row<'_>) -> rusqlite::Result<EffortPercentileThresholds> {
        let p20 = row.get(0)?;
        let p40 = row.get(1)?;
        let p60 = row.get(2)?;
        let p80 = row.get(3)?;
        // Stored as a SQLite integer; converted back to the in-memory count type.
        let sample_count = row.get::<_, i64>(4)? as usize;
        Ok(EffortPercentileThresholds {
            p20,
            p40,
            p60,
            p80,
            sample_count,
        })
    }

    let sql = "SELECT p20, p40, p60, p80, sample_count \
FROM effort_percentile_thresholds \
WHERE dataset = ?1";

    match conn.query_row(sql, params![DEFAULT_DATASET], row_to_thresholds) {
        Ok(found) => Ok(Some(found)),
        // An absent row is an expected state, not a failure.
        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
        Err(other) => Err(TgaError::from(other)),
    }
}
/// Writes `thresholds` as the row for the default dataset, replacing any
/// existing row, and stamps it with the current Unix time in seconds.
///
/// # Errors
/// Returns a `TgaError` if the INSERT fails.
pub fn persist_thresholds(
    conn: &Connection,
    thresholds: &EffortPercentileThresholds,
) -> Result<()> {
    // A clock set before the Unix epoch falls back to a timestamp of 0
    // instead of failing the write.
    let computed_at = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    };
    let sample_count = thresholds.sample_count as i64;

    let sql = "INSERT OR REPLACE INTO effort_percentile_thresholds \
(dataset, p20, p40, p60, p80, sample_count, computed_at) \
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)";

    conn.execute(
        sql,
        params![
            DEFAULT_DATASET,
            thresholds.p20,
            thresholds.p40,
            thresholds.p60,
            thresholds.p80,
            sample_count,
            computed_at,
        ],
    )
    .map(|_affected| ())
    .map_err(TgaError::from)
}
/// Computes the 20th/40th/60th/80th percentile cut-offs of `scores` using the
/// nearest-rank method (`rank = ceil(p / 100 * n)`, 1-based).
///
/// NaN scores cannot be ordered and are ignored; `sample_count` in the result
/// is the number of scores that were actually used.
///
/// # Returns
/// `None` when fewer than `MIN_CORPUS_SIZE` usable scores remain, otherwise
/// the thresholds.
pub fn compute_percentiles(scores: &[f64]) -> Option<EffortPercentileThresholds> {
    // Drop NaN up front: with NaN present `partial_cmp` is not a total order,
    // which makes the sort result unspecified (and may panic on newer std
    // sort implementations) and could leak NaN into the thresholds.
    let mut sorted: Vec<f64> = scores.iter().copied().filter(|s| !s.is_nan()).collect();

    // `.max(1)` keeps the `n - 1` below safe even if the minimum is ever set to 0.
    if sorted.len() < MIN_CORPUS_SIZE.max(1) {
        return None;
    }

    sorted.sort_unstable_by(f64::total_cmp);
    let n = sorted.len();

    let percentile_value = |p: f64| -> f64 {
        let rank = ((p / 100.0) * n as f64).ceil() as usize;
        // Convert the 1-based rank to an index and clamp it into bounds.
        sorted[rank.saturating_sub(1).min(n - 1)]
    };

    Some(EffortPercentileThresholds {
        p20: percentile_value(20.0),
        p40: percentile_value(40.0),
        p60: percentile_value(60.0),
        p80: percentile_value(80.0),
        sample_count: n,
    })
}
pub fn rebin_all(conn: &mut Connection) -> Result<(usize, Option<EffortPercentileThresholds>)> {
let rows: Vec<(String, String, f64, String)> = {
let mut stmt = conn
.prepare("SELECT sha, repository, score, size FROM fact_commit_effort")
.map_err(TgaError::from)?;
let iter = stmt
.query_map([], |row| {
Ok((
row.get::<_, String>(0)?,
row.get::<_, String>(1)?,
row.get::<_, f64>(2)?,
row.get::<_, String>(3)?,
))
})
.map_err(TgaError::from)?;
let mut v = Vec::new();
for r in iter {
v.push(r.map_err(TgaError::from)?);
}
v
};
let scores: Vec<f64> = rows.iter().map(|(_, _, s, _)| *s).collect();
let thresholds = compute_percentiles(&scores);
if thresholds.is_none() {
tracing::warn!(
count = rows.len(),
min_required = MIN_CORPUS_SIZE,
"effort percentile: corpus too small for percentile binning; \
falling back to static size-label mapping"
);
}
if let Some(ref t) = thresholds {
persist_thresholds(conn, t)?;
}
let updates: Vec<(i64, String, String)> = rows
.iter()
.map(|(sha, repo, score, size)| {
let tshirt = match &thresholds {
Some(t) => t.band_for_score(*score),
None => effort_tshirt_from_size(size),
};
(tshirt, sha.clone(), repo.clone())
})
.collect();
let tx = conn.transaction().map_err(TgaError::from)?;
{
let mut stmt = tx
.prepare(
"UPDATE fact_commit_effort SET effort_tshirt = ?1 \
WHERE sha = ?2 AND repository = ?3",
)
.map_err(TgaError::from)?;
for (tshirt, sha, repo) in &updates {
stmt.execute(params![tshirt, sha, repo])
.map_err(TgaError::from)?;
}
}
tx.commit().map_err(TgaError::from)?;
Ok((updates.len(), thresholds))
}
/// Bands a single score without recomputing the corpus percentiles.
///
/// Uses the thresholds currently stored in the database when there are any;
/// otherwise falls back to `effort_tshirt_from_size` on `size_label`.
///
/// # Errors
/// Propagates any error from `load_thresholds`.
pub fn tshirt_for_score_incremental(
    conn: &Connection,
    score: f64,
    size_label: &str,
) -> Result<i64> {
    let stored = load_thresholds(conn)?;
    let band = if let Some(thresholds) = stored {
        thresholds.band_for_score(score)
    } else {
        effort_tshirt_from_size(size_label)
    };
    Ok(band)
}
// Unit tests for percentile computation, band assignment, threshold
// persistence, full rebinning and the incremental lookup. Database-backed
// tests run against an in-memory `Database`.
#[cfg(test)]
mod tests {
use super::*;
use crate::core::db::Database;
use rusqlite::params;
// Test helper: inserts (or replaces) one `fact_commit_effort` row with the
// given sha, repository, score and size label. The remaining columns get
// fixed placeholder values and `effort_tshirt` starts at 0.
fn insert_effort_row(conn: &Connection, sha: &str, repo: &str, score: f64, size: &str) {
conn.execute(
"INSERT OR REPLACE INTO fact_commit_effort \
(sha, repository, size, score, loc, files, test_loc, tests_factor, \
formula_version, computed_at, effort_tshirt) \
VALUES (?1, ?2, ?3, ?4, 10, 1, 0, 1.0, 'v1', 0, 0)",
params![sha, repo, size, score],
)
.expect("insert effort row");
}
// Scores 1..=10 must produce thresholds 2, 4, 6 and 8 with a sample count of 10.
#[test]
fn compute_percentiles_known_distribution() {
let scores: Vec<f64> = (1..=10).map(|v| v as f64).collect();
let t = compute_percentiles(&scores).expect("thresholds computed");
assert!((t.p20 - 2.0).abs() < 1e-9, "p20 expected 2.0 got {}", t.p20);
assert!((t.p40 - 4.0).abs() < 1e-9, "p40 expected 4.0 got {}", t.p40);
assert!((t.p60 - 6.0).abs() < 1e-9, "p60 expected 6.0 got {}", t.p60);
assert!((t.p80 - 8.0).abs() < 1e-9, "p80 expected 8.0 got {}", t.p80);
assert_eq!(t.sample_count, 10);
}
// Three scores are below the minimum corpus size, so no thresholds are produced.
#[test]
fn compute_percentiles_tiny_corpus_returns_none() {
let scores = vec![1.0_f64, 2.0, 3.0];
let result = compute_percentiles(&scores);
assert!(result.is_none(), "tiny corpus must return None, not panic");
}
// Checks both sides of every boundary: a score exactly on a threshold
// belongs to the higher band.
#[test]
fn band_assignment_uses_stored_thresholds() {
let t = EffortPercentileThresholds {
p20: 5.0,
p40: 10.0,
p60: 15.0,
p80: 20.0,
sample_count: 100,
};
assert_eq!(t.band_for_score(0.0), 1, "score below p20 → band 1");
assert_eq!(t.band_for_score(4.9), 1);
assert_eq!(t.band_for_score(5.0), 2, "score at p20 → band 2");
assert_eq!(t.band_for_score(9.9), 2);
assert_eq!(t.band_for_score(10.0), 3, "score at p40 → band 3");
assert_eq!(t.band_for_score(14.9), 3);
assert_eq!(t.band_for_score(15.0), 4, "score at p60 → band 4");
assert_eq!(t.band_for_score(19.9), 4);
assert_eq!(t.band_for_score(20.0), 5, "score at p80 → band 5");
assert_eq!(t.band_for_score(999.0), 5);
}
// Thresholds written with `persist_thresholds` must come back unchanged
// from `load_thresholds`.
#[test]
fn percentile_thresholds_round_trip() {
let db = Database::open_in_memory().expect("open db");
let conn = db.connection();
let t = EffortPercentileThresholds {
p20: 3.5,
p40: 7.0,
p60: 11.5,
p80: 17.25,
sample_count: 42,
};
persist_thresholds(conn, &t).expect("persist");
let loaded = load_thresholds(conn).expect("load").expect("must be Some");
assert!((loaded.p20 - t.p20).abs() < 1e-9, "p20 round-trip");
assert!((loaded.p40 - t.p40).abs() < 1e-9, "p40 round-trip");
assert!((loaded.p60 - t.p60).abs() < 1e-9, "p60 round-trip");
assert!((loaded.p80 - t.p80).abs() < 1e-9, "p80 round-trip");
assert_eq!(loaded.sample_count, t.sample_count);
}
// Ten rows with scores 1..=10: `rebin_all` must update all of them, return
// the expected thresholds and store percentile-based bands.
#[test]
fn rebin_assigns_quintiles() {
let mut db = Database::open_in_memory().expect("open db");
{
let conn = db.connection();
for i in 1..=10u32 {
let sha = format!("sha{i:03}");
insert_effort_row(conn, &sha, "repo", i as f64, "M");
}
}
let (updated, thresholds) = rebin_all(db.connection_mut()).expect("rebin");
assert_eq!(updated, 10, "all 10 rows must be rebinned");
let t = thresholds.expect("thresholds computed for 10-row corpus");
assert!((t.p20 - 2.0).abs() < 1e-9, "p20");
assert!((t.p80 - 8.0).abs() < 1e-9, "p80");
let conn = db.connection();
// Read back (score, band) pairs ordered by score to check the stored bands.
let bands: Vec<(i64, i64)> = {
let mut stmt = conn
.prepare(
"SELECT CAST(score AS INTEGER), effort_tshirt \
FROM fact_commit_effort \
ORDER BY score ASC",
)
.expect("prepare");
stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?)))
.expect("query")
.map(|r| r.expect("row"))
.collect()
};
assert_eq!(bands[0], (1, 1), "score=1 → band 1");
assert_eq!(bands[1], (2, 2), "score=2 → band 2 (at p20)");
assert_eq!(bands[9], (10, 5), "score=10 → band 5");
}
// Three rows are too few for percentiles: `rebin_all` must return no
// thresholds and give every "M" row the static value 3.
#[test]
fn rebin_tiny_corpus_fallback() {
let mut db = Database::open_in_memory().expect("open db");
{
let conn = db.connection();
for i in 1..=3u32 {
insert_effort_row(conn, &format!("sha{i}"), "repo", i as f64, "M");
}
}
let (updated, thresholds) = rebin_all(db.connection_mut()).expect("rebin tiny");
assert_eq!(updated, 3);
assert!(
thresholds.is_none(),
"tiny corpus must yield None thresholds"
);
let conn = db.connection();
let tshirts: Vec<i64> = {
let mut stmt = conn
.prepare("SELECT effort_tshirt FROM fact_commit_effort")
.expect("prepare");
stmt.query_map([], |r| r.get(0))
.expect("query")
.map(|r| r.expect("row"))
.collect()
};
assert!(
tshirts.iter().all(|&v| v == 3),
"all rows should be effort_tshirt=3 (static M mapping), got {tshirts:?}"
);
}
// With thresholds stored, the incremental lookup must band by score; the
// size labels passed here are not what decides the expected bands.
#[test]
fn incremental_bins_against_stored_thresholds() {
let db = Database::open_in_memory().expect("open db");
let conn = db.connection();
let t = EffortPercentileThresholds {
p20: 5.0,
p40: 10.0,
p60: 15.0,
p80: 20.0,
sample_count: 50,
};
persist_thresholds(conn, &t).expect("persist");
let band1 = tshirt_for_score_incremental(conn, 4.9, "XS").expect("band1");
let band2 = tshirt_for_score_incremental(conn, 5.0, "S").expect("band2");
let band5 = tshirt_for_score_incremental(conn, 25.0, "XL").expect("band5");
assert_eq!(band1, 1, "score < p20 → band 1");
assert_eq!(band2, 2, "score at p20 → band 2");
assert_eq!(band5, 5, "score >= p80 → band 5");
}
// With nothing persisted, the incremental lookup must fall back to the
// static size-label mapping ("M" is expected to give 3).
#[test]
fn incremental_fallback_when_no_stored_thresholds() {
let db = Database::open_in_memory().expect("open db");
let conn = db.connection();
let result = tshirt_for_score_incremental(conn, 12.0, "M").expect("fallback");
assert_eq!(result, 3, "static M → 3");
}
}