use clap::{Args, Subcommand};
use git2::{Repository, Sort};
use rusqlite::{params, Connection};
use tga::collect::git::scan_and_persist;
use tga::collect::ticket::{extract_ticket_id, is_ticketed};
use tga::core::config::{expand_path, Config};
use tga::core::db::Database;
use tga::core::effort::{compute_effort, FORMULA_VERSION};
/// Recompute derived commit data for commits that are already stored in the database.
#[derive(Args, Debug)]
pub struct BackfillArgs {
    // Selects which backfill to run; see `BackfillSubcommand`.
    #[command(subcommand)]
    pub subcommand: BackfillSubcommand,
    /// Report what would change without writing anything.
    #[arg(long, default_value_t = false, global = true)]
    pub dry_run: bool,
}
/// The individual backfills that can be run.
#[derive(Subcommand, Debug)]
pub enum BackfillSubcommand {
    /// Clear low-confidence (< 0.7) LLM classifications so the next `tga classify` run
    /// reprocesses those commits.
    AiDetection,
    /// Recompute the `is_revert` flag from each stored commit message.
    RevertFlags,
    /// Recompute `ticket_id` and `ticketed` from each stored commit message.
    TicketIds,
    /// Re-run the reachability scan for the configured repositories.
    Reachability(ReachabilityBackfillArgs),
    /// Compute effort scores and store them in `fact_commit_effort`.
    Effort(EffortBackfillArgs),
}
/// Re-run the reachability scan for the configured repositories.
#[derive(Args, Debug)]
pub struct ReachabilityBackfillArgs {
    /// Only scan the named repository; repeat the flag to select several.
    /// When omitted, every configured repository is scanned.
    #[arg(long = "repo", value_name = "NAME")]
    pub repos: Vec<String>,
}
/// Compute effort scores and store them in `fact_commit_effort`.
#[derive(Args, Debug)]
pub struct EffortBackfillArgs {
    /// Only score the named repository. When omitted, every configured repository is scored.
    #[arg(long = "repo", value_name = "NAME")]
    pub repo: Option<String>,
    /// Git revisions to score: `BASE..TIP`, `..TIP`, or a single ref.
    /// Setting this reads commits from the git repository instead of the database.
    #[arg(long, value_name = "RANGE")]
    pub range: Option<String>,
    /// Re-score commits that already have an effort row instead of skipping them.
    #[arg(long, default_value_t = false)]
    pub force: bool,
    /// Also write an `Effort: SIZE` git note under `refs/notes/effort` for each scored commit.
    /// Setting this reads commits from the git repository instead of the database.
    #[arg(long, default_value_t = false)]
    pub notes: bool,
    /// Maximum number of commits to score per repository.
    #[arg(long, value_name = "N")]
    pub limit: Option<usize>,
}
pub fn run(config: Config, db: &mut Database, args: BackfillArgs) -> anyhow::Result<()> {
match args.subcommand {
BackfillSubcommand::AiDetection => backfill_ai_detection(db, args.dry_run),
BackfillSubcommand::RevertFlags => backfill_revert_flags(db, args.dry_run),
BackfillSubcommand::TicketIds => backfill_ticket_ids(db, args.dry_run),
BackfillSubcommand::Reachability(reach_args) => {
backfill_reachability(config, db, reach_args, args.dry_run)
}
BackfillSubcommand::Effort(effort_args) => {
backfill_effort(config, db, effort_args, args.dry_run)
}
}
}
/// Scores commits for every configured repository (optionally narrowed by `--repo`) and
/// prints a per-repository line followed by an overall summary.
///
/// Commits are read from git when `--range` or `--notes` is given, otherwise from the
/// `commits`/`files` tables. A failure in one repository is logged and printed, and the
/// remaining repositories are still processed; the function itself then returns `Ok`.
fn backfill_effort(
    config: Config,
    db: &mut Database,
    args: EffortBackfillArgs,
    dry_run: bool,
) -> anyhow::Result<()> {
    // Resolve each configured repository to (path, display name), applying the --repo filter.
    let mut repos_to_process: Vec<(std::path::PathBuf, String)> = Vec::new();
    for repo_cfg in &config.repositories {
        let path = expand_path(&repo_cfg.path);
        let name = match repo_cfg.name.clone() {
            Some(explicit) => explicit,
            // No explicit name: use the last path component, else the whole path.
            None => match path.file_name().and_then(|s| s.to_str()) {
                Some(leaf) => leaf.to_string(),
                None => path.display().to_string(),
            },
        };
        let wanted = match args.repo {
            Some(ref filter) => &name == filter,
            None => true,
        };
        if wanted {
            repos_to_process.push((path, name));
        }
    }

    if repos_to_process.is_empty() {
        println!("No matching repositories found in config.");
        return Ok(());
    }

    let use_git_path = args.range.is_some() || args.notes;
    let mut total_scored: usize = 0;
    let mut total_skipped: usize = 0;
    let mut total_repos: usize = 0;
    // Buckets are XS, S, M, L, XL in that order.
    let mut size_counts = [0usize; 5];

    for (repo_path, repo_name) in &repos_to_process {
        let outcome = if use_git_path {
            // The git path persists its own rows.
            process_one_repo_git(repo_path, repo_name, db, &args, dry_run)
        } else {
            // The db path only computes rows; persisting happens here.
            match process_one_repo_db(db.connection(), repo_name, &args, dry_run) {
                Ok((scored, skipped, sizes, rows)) => {
                    let persisted = if dry_run {
                        Ok(())
                    } else {
                        persist_effort_rows(db, &rows)
                    };
                    persisted.map(|()| (scored, skipped, sizes))
                }
                Err(e) => Err(e),
            }
        };

        let (scored, skipped, sizes) = match outcome {
            Ok(counts) => counts,
            Err(e) => {
                tracing::warn!(repo = %repo_name, error = %e, "backfill effort failed for repo");
                println!(" {repo_name}: error — {e}");
                continue;
            }
        };

        total_repos += 1;
        total_scored += scored;
        total_skipped += skipped;
        for (bucket, added) in size_counts.iter_mut().zip(sizes.iter()) {
            *bucket += *added;
        }
        let verb = if dry_run { "would score" } else { "scored" };
        println!(
            " {repo_name}: {verb} {scored} commits, skipped {skipped} already-scored"
        );
    }

    let verb = if dry_run { "Would score" } else { "Scored" };
    println!(
        "\nBackfill complete: {total_repos} repos, {verb} {total_scored} commits \
         ({} skipped already-scored).",
        total_skipped,
    );
    let [xs, s, m, l, xl] = size_counts;
    println!(
        " Size distribution: XS={} S={} M={} L={} XL={}",
        xs, s, m, l, xl,
    );
    Ok(())
}
/// Scores one repository's commits from the per-file stats stored in `commits`/`files`.
///
/// Nothing is written: the computed rows are returned so the caller can decide whether to
/// persist them (`dry_run` is only used for logging here).
///
/// Returns `(scored, skipped, size_counts, rows)` where `skipped` counts commits that
/// already had a row in `fact_commit_effort` (always 0 with `--force`) and `size_counts`
/// is bucketed as XS, S, M, L, XL. At most `args.limit` rows are produced.
///
/// Commits without any row in `files` never appear because of the inner join.
///
/// # Errors
///
/// Returns an error if preparing or reading either the `fact_commit_effort` query or the
/// `commits`/`files` query fails.
fn process_one_repo_db(
    conn: &Connection,
    repo_name: &str,
    args: &EffortBackfillArgs,
    dry_run: bool,
) -> anyhow::Result<(usize, usize, [usize; 5], Vec<EffortRow>)> {
    let already_scored: std::collections::HashSet<String> = if args.force {
        std::collections::HashSet::new()
    } else {
        let mut stmt = conn.prepare("SELECT sha FROM fact_commit_effort WHERE repository = ?1")?;
        let rows = stmt.query_map(params![repo_name], |row| row.get::<_, String>(0))?;
        // Bound to a local so the row iterator is dropped before `stmt`.
        let set = rows.collect::<Result<std::collections::HashSet<String>, _>>()?;
        set
    };

    // Only used for log output, so a failed count degrades to 0 instead of aborting.
    let in_db: i64 = conn
        .query_row(
            "SELECT COUNT(DISTINCT c.sha) FROM commits c WHERE c.repository = ?1",
            params![repo_name],
            |r| r.get(0),
        )
        .unwrap_or(0);
    tracing::info!(
        repo = %repo_name,
        in_db = in_db,
        already_scored = already_scored.len(),
        "effort backfill db path: starting"
    );

    // One result row per (commit, file); rows of one commit are adjacent because the
    // ordering is by (timestamp, sha).
    let mut stmt = conn.prepare(
        "SELECT c.sha, f.path, f.insertions, f.deletions \
         FROM commits c \
         JOIN files f ON f.commit_id = c.id \
         WHERE c.repository = ?1 \
         ORDER BY c.timestamp ASC, c.sha ASC",
    )?;
    let limit = args.limit.unwrap_or(usize::MAX);
    let mut records: Vec<EffortRow> = Vec::new();
    let mut skipped: usize = 0;

    // Scores one completed commit group. Returns `false` once the limit has been reached,
    // telling the caller to stop reading rows.
    let mut flush = |sha: &str, files: &[(String, u32, u32)]| -> bool {
        if records.len() >= limit {
            return false;
        }
        if already_scored.contains(sha) {
            skipped += 1;
            return true;
        }
        if files.is_empty() {
            tracing::warn!(
                sha = %sha,
                "commit has no rows in the files table; skipping effort computation"
            );
            return true;
        }
        let file_refs: Vec<(&str, u32, u32)> =
            files.iter().map(|(p, i, d)| (p.as_str(), *i, *d)).collect();
        let effort = compute_effort(file_refs);
        let computed_at = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0);
        records.push(EffortRow {
            sha: sha.to_string(),
            repository: repo_name.to_string(),
            size: effort.size_label().to_string(),
            score: effort.score,
            loc: effort.loc,
            files: effort.files,
            test_loc: effort.test_loc,
            tests_factor: effort.tests_factor,
            formula_version: FORMULA_VERSION.to_string(),
            computed_at,
        });
        if records.len().is_multiple_of(1000) {
            tracing::info!(
                repo = %repo_name,
                processed = records.len(),
                "effort backfill db path: progress"
            );
        }
        records.len() < limit
    };

    let rows = stmt.query_map(params![repo_name], |row| {
        Ok((
            row.get::<_, String>(0)?,
            row.get::<_, String>(1)?,
            row.get::<_, u32>(2)?,
            row.get::<_, u32>(3)?,
        ))
    })?;

    let mut current_sha: Option<String> = None;
    let mut current_files: Vec<(String, u32, u32)> = Vec::new();
    for row_res in rows {
        let (sha, path, ins, del) = row_res?;
        if current_sha.as_deref() != Some(sha.as_str()) {
            // The SHA changed: the previous commit's file list is complete.
            if let Some(prev_sha) = current_sha.take() {
                let keep_going = flush(&prev_sha, &current_files);
                current_files.clear();
                if !keep_going {
                    // `current_sha` stays `None`, so the trailing flush below is skipped.
                    break;
                }
            }
            current_sha = Some(sha);
        }
        current_files.push((path, ins, del));
    }
    // The last commit group has no following row to trigger its flush.
    if let Some(last_sha) = current_sha.take() {
        flush(&last_sha, &current_files);
    }

    let mut size_counts = [0usize; 5];
    for row in &records {
        let idx = match row.size.as_str() {
            "XS" => 0,
            "S" => 1,
            "M" => 2,
            "L" => 3,
            // Any other label is counted in the XL bucket.
            _ => 4,
        };
        size_counts[idx] += 1;
    }
    tracing::info!(
        repo = %repo_name,
        in_db = in_db,
        scored = records.len(),
        skipped = skipped,
        dry_run = dry_run,
        "effort backfill db path: complete"
    );
    Ok((records.len(), skipped, size_counts, records))
}
fn process_one_repo_git(
repo_path: &std::path::Path,
repo_name: &str,
db: &mut Database,
args: &EffortBackfillArgs,
dry_run: bool,
) -> anyhow::Result<(usize, usize, [usize; 5])> {
let repo = Repository::open(repo_path)
.map_err(|e| anyhow::anyhow!("cannot open git repo {}: {e}", repo_path.display()))?;
let already_scored: std::collections::HashSet<String> = if args.force {
std::collections::HashSet::new()
} else {
let conn = db.connection();
let mut stmt = conn.prepare("SELECT sha FROM fact_commit_effort WHERE repository = ?1")?;
let rows = stmt.query_map(params![repo_name], |row| row.get::<_, String>(0))?;
let mut set = std::collections::HashSet::new();
for r in rows {
set.insert(r?);
}
set
};
let mut revwalk = repo.revwalk()?;
revwalk.set_sorting(Sort::TIME)?;
if let Some(ref range) = args.range {
if let Some((base, tip)) = range.split_once("..") {
let tip_oid = repo
.revparse_single(tip.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{tip}': {e}"))?
.id();
revwalk.push(tip_oid)?;
if !base.trim().is_empty() {
let base_oid = repo
.revparse_single(base.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{base}': {e}"))?
.id();
revwalk.hide(base_oid)?;
}
} else {
let oid = repo
.revparse_single(range.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{range}': {e}"))?
.id();
revwalk.push(oid)?;
}
} else {
let _ = revwalk.push_head();
}
let mut records: Vec<EffortRow> = Vec::new();
let mut skipped: usize = 0;
let limit = args.limit.unwrap_or(usize::MAX);
for oid_res in revwalk {
if records.len() >= limit {
break;
}
let oid = match oid_res {
Ok(o) => o,
Err(e) => {
tracing::warn!(repo = %repo_name, error = %e, "revwalk error; stopping");
break;
}
};
let sha_str = oid.to_string();
if already_scored.contains(&sha_str) {
skipped += 1;
continue;
}
let commit = match repo.find_commit(oid) {
Ok(c) => c,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot find commit; skipping");
continue;
}
};
let tree = match commit.tree() {
Ok(t) => t,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot get tree; skipping");
continue;
}
};
let parent_tree = if commit.parent_count() > 0 {
match commit.parent(0).and_then(|p| p.tree()) {
Ok(t) => Some(t),
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot get parent tree; skipping");
continue;
}
}
} else {
None
};
let diff = match repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), None) {
Ok(d) => d,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "diff failed; skipping");
continue;
}
};
let file_stats: std::cell::RefCell<Vec<(String, u32, u32)>> =
std::cell::RefCell::new(Vec::new());
let _ = diff.foreach(
&mut |delta, _progress| {
let path = delta
.new_file()
.path()
.or_else(|| delta.old_file().path())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
file_stats.borrow_mut().push((path, 0, 0));
true
},
None,
None,
Some(&mut |delta, _hunk, line| {
let path = delta
.new_file()
.path()
.or_else(|| delta.old_file().path())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
let mut files = file_stats.borrow_mut();
if let Some(entry) = files.iter_mut().find(|e| e.0 == path) {
match line.origin() {
'+' => entry.1 = entry.1.saturating_add(1),
'-' => entry.2 = entry.2.saturating_add(1),
_ => {}
}
}
true
}),
);
let stats_snapshot = file_stats.into_inner();
let file_refs: Vec<(&str, u32, u32)> = stats_snapshot
.iter()
.map(|(p, ins, del)| (p.as_str(), *ins, *del))
.collect();
let effort = compute_effort(file_refs);
let computed_at = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0);
records.push(EffortRow {
sha: sha_str,
repository: repo_name.to_string(),
size: effort.size_label().to_string(),
score: effort.score,
loc: effort.loc,
files: effort.files,
test_loc: effort.test_loc,
tests_factor: effort.tests_factor,
formula_version: FORMULA_VERSION.to_string(),
computed_at,
});
if records.len().is_multiple_of(1000) {
tracing::info!(
repo = %repo_name,
processed = records.len(),
"effort backfill progress"
);
}
}
if args.notes && !dry_run {
write_effort_notes(&repo, &records);
}
let mut size_counts = [0usize; 5];
for row in &records {
let idx = match row.size.as_str() {
"XS" => 0,
"S" => 1,
"M" => 2,
"L" => 3,
_ => 4, };
size_counts[idx] += 1;
}
if !dry_run {
persist_effort_rows(db, &records)?;
}
Ok((records.len(), skipped, size_counts))
}
/// One computed effort score for a single commit, in the shape written to the
/// `fact_commit_effort` table by `persist_effort_rows`.
struct EffortRow {
/// Commit SHA as a string.
sha: String,
/// Name of the repository the commit was scored for.
repository: String,
/// Size label taken from the effort's `size_label()`; the summaries in this file
/// bucket it as "XS", "S", "M", "L", with anything else counted as XL.
size: String,
/// Effort score as returned by `compute_effort`.
score: f64,
/// `loc` value reported by `compute_effort`.
loc: u32,
/// `files` value reported by `compute_effort`.
files: u32,
/// `test_loc` value reported by `compute_effort`.
test_loc: u32,
/// `tests_factor` value reported by `compute_effort`.
tests_factor: f64,
/// Value of `FORMULA_VERSION` at the time the row was computed.
formula_version: String,
/// When the row was computed, in seconds since the Unix epoch (0 if the clock read failed).
computed_at: i64,
}
/// Writes `rows` into `fact_commit_effort`, replacing any existing row that conflicts.
///
/// Rows are written in batches of 1000, one transaction per batch, so a failure part-way
/// through leaves the earlier batches committed.
///
/// # Errors
///
/// Returns an error if opening a transaction, preparing the statement, executing an insert
/// or committing fails.
fn persist_effort_rows(db: &mut Database, rows: &[EffortRow]) -> anyhow::Result<()> {
    const BATCH_SIZE: usize = 1000;

    for batch in rows.chunks(BATCH_SIZE) {
        let tx = db.connection_mut().transaction()?;
        {
            // Scoped so the prepared statement is dropped before the commit.
            let mut insert = tx.prepare(
                "INSERT OR REPLACE INTO fact_commit_effort \
                 (sha, repository, size, score, loc, files, test_loc, tests_factor, \
                 formula_version, computed_at) \
                 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)",
            )?;
            for record in batch {
                insert.execute(params![
                    record.sha,
                    record.repository,
                    record.size,
                    record.score,
                    i64::from(record.loc),
                    i64::from(record.files),
                    i64::from(record.test_loc),
                    record.tests_factor,
                    record.formula_version,
                    record.computed_at,
                ])?;
            }
        }
        tx.commit()?;
    }
    Ok(())
}
fn write_effort_notes(repo: &Repository, rows: &[EffortRow]) {
let sig = match repo.signature() {
Ok(s) => s,
Err(_) => match git2::Signature::now("tga", "tga@localhost") {
Ok(s) => s,
Err(e) => {
tracing::warn!(error = %e, "cannot create git signature for notes; skipping");
return;
}
},
};
for row in rows {
let oid = match git2::Oid::from_str(&row.sha) {
Ok(o) => o,
Err(_) => continue,
};
let note_body = format!("Effort: {}", row.size);
if let Err(e) = repo.note(
&sig,
&sig,
Some("refs/notes/effort"),
oid,
¬e_body,
true, ) {
tracing::warn!(sha = %row.sha, error = %e, "failed to write git note; skipping");
}
}
}
/// Re-runs the reachability scan for the configured repositories, optionally narrowed to
/// the names given with `--repo`, and prints per-repository and overall totals.
///
/// With `dry_run`, only the number of repositories that would be scanned is printed. That
/// number is taken from the same selection the real run uses, so a `--repo` name that
/// matches nothing in the config is not counted.
///
/// A failed scan is logged, printed and counted; the remaining repositories are still
/// scanned and the function returns `Ok`.
fn backfill_reachability(
    config: Config,
    db: &mut Database,
    args: ReachabilityBackfillArgs,
    dry_run: bool,
) -> anyhow::Result<()> {
    // Resolve every configured repository to (path, name) and keep the requested ones.
    let selected: Vec<(std::path::PathBuf, String)> = config
        .repositories
        .iter()
        .filter_map(|repo_cfg| {
            let path = expand_path(&repo_cfg.path);
            let name = repo_cfg
                .name
                .clone()
                .or_else(|| {
                    path.file_name()
                        .and_then(|s| s.to_str())
                        .map(|s| s.to_string())
                })
                .unwrap_or_else(|| path.display().to_string());
            if !args.repos.is_empty() && !args.repos.contains(&name) {
                return None;
            }
            Some((path, name))
        })
        .collect();

    if dry_run {
        println!(
            "Dry run — would re-run reachability scan for {} repo(s). No changes written.",
            selected.len()
        );
        return Ok(());
    }

    let reach_cfg = &config.reachability;
    let conn = db.connection();
    let mut total_repos = 0usize;
    let mut total_rows = 0usize;
    let mut total_default_branch = 0usize;
    let mut failed = 0usize;

    for (path, name) in &selected {
        total_repos += 1;
        tracing::info!(repo = %name, "backfill reachability scan");
        match scan_and_persist(path, conn, reach_cfg, Some(name)) {
            Ok(stats) => {
                println!(
                    " {name}: {} rows upserted \
                     ({} on default branch, {} tagged, {} on release branch)",
                    stats.rows_upserted,
                    stats.default_branch_commits,
                    stats.tagged_commits,
                    stats.release_branch_commits,
                );
                total_rows += stats.rows_upserted;
                total_default_branch += stats.default_branch_commits;
            }
            Err(e) => {
                let msg = format!(" {name}: reachability scan failed: {e}");
                tracing::warn!("{msg}");
                println!("{msg}");
                failed += 1;
            }
        }
    }

    println!(
        "\nBackfill complete: {total_repos} repos, {total_rows} rows upserted, \
         {total_default_branch} commits on default branch."
    );
    if failed > 0 {
        println!("{} repo(s) had errors (see warnings above).", failed);
    }
    Ok(())
}
/// Clears the classification of commits that were classified by the `llm` method with a
/// confidence below 0.7, so that the next `tga classify` run reprocesses them.
///
/// A commit with no stored confidence is treated as confidence 0.0 and is therefore
/// cleared. With `dry_run`, only the number of commits that would be cleared is printed;
/// the count and the update share one predicate so the two always agree.
///
/// # Errors
///
/// Returns an error if the count query (dry run) or the update transaction fails.
fn backfill_ai_detection(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    // Single source of truth for "low-confidence LLM classification". This is the predicate
    // the real run has always used for its UPDATE.
    const LOW_CONFIDENCE_LLM: &str = "classification_id IN ( \
         SELECT id FROM classifications WHERE method = 'llm' \
         ) AND COALESCE(confidence, 0.0) < 0.7";

    if dry_run {
        // A failed count is an error, not "0 commits".
        let count: i64 = db.connection().query_row(
            &format!("SELECT COUNT(*) FROM commits WHERE {LOW_CONFIDENCE_LLM}"),
            [],
            |row| row.get(0),
        )?;
        println!(
            "Would re-classify {count} commits (method='llm', confidence<0.7). No changes written."
        );
        return Ok(());
    }

    let conn = db.connection_mut();
    let tx = conn.transaction()?;
    let n = tx.execute(
        &format!(
            "UPDATE commits SET classification_id = NULL, confidence = NULL \
             WHERE {LOW_CONFIDENCE_LLM}"
        ),
        [],
    )?;
    tx.commit()?;
    println!(
        "Cleared classification on {n} commits — next `tga classify` run will reprocess them."
    );
    Ok(())
}
/// Recomputes `is_revert` for every commit from its message and updates only the rows
/// whose stored flag differs from the detected one.
///
/// With `dry_run`, the number of rows that would change is printed and nothing is written.
/// Otherwise all updates are applied in a single transaction.
fn backfill_revert_flags(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    // Pass 1: collect (commit id, new flag) for every row that needs changing.
    let pending: Vec<(i64, bool)> = {
        let conn = db.connection();
        let mut select = conn.prepare("SELECT id, message, is_revert FROM commits")?;
        let scanned = select.query_map([], |row| {
            let id: i64 = row.get(0)?;
            let message: String = row.get(1)?;
            let stored: i64 = row.get(2)?;
            Ok((id, message, stored))
        })?;
        let mut changed = Vec::new();
        for item in scanned {
            let (id, message, stored) = item?;
            let detected = is_revert(&message);
            if i64::from(detected) != stored {
                changed.push((id, detected));
            }
        }
        changed
    };

    let total = pending.len();
    let revert_total = pending.iter().filter(|(_, flag)| *flag).count();

    if dry_run {
        println!(
            "Would update {} commits ({} would be marked as reverts). No changes written.",
            total, revert_total,
        );
        return Ok(());
    }

    // Pass 2: apply everything atomically.
    let tx = db.connection_mut().transaction()?;
    {
        let mut update = tx.prepare("UPDATE commits SET is_revert = ?1 WHERE id = ?2")?;
        for (id, flag) in &pending {
            update.execute(params![i64::from(*flag), id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated is_revert on {} commits ({} are reverts).",
        total, revert_total,
    );
    Ok(())
}
/// Recomputes `ticket_id` and `ticketed` for every commit from its message and updates
/// only the rows where either stored value differs from the recomputed one.
///
/// With `dry_run`, the number of rows that would change is printed and nothing is written.
/// Otherwise all updates are applied in a single transaction.
fn backfill_ticket_ids(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    // Pass 1: collect (commit id, new ticket id, new ticketed flag) for rows that differ.
    let pending: Vec<(i64, Option<String>, i64)> = {
        let conn = db.connection();
        let mut select = conn.prepare("SELECT id, message, ticket_id, ticketed FROM commits")?;
        let scanned = select.query_map([], |row| {
            let id: i64 = row.get(0)?;
            let message: String = row.get(1)?;
            let stored_id: Option<String> = row.get(2)?;
            let stored_flag: i64 = row.get(3)?;
            Ok((id, message, stored_id, stored_flag))
        })?;
        let mut changed = Vec::new();
        for item in scanned {
            let (id, message, stored_id, stored_flag) = item?;
            let extracted = extract_ticket_id(&message);
            let ticketed = i64::from(is_ticketed(&message));
            let unchanged = extracted == stored_id && ticketed == stored_flag;
            if !unchanged {
                changed.push((id, extracted, ticketed));
            }
        }
        changed
    };

    let total = pending.len();
    let with_id = pending
        .iter()
        .filter(|(_, ticket, _)| ticket.is_some())
        .count();

    if dry_run {
        println!(
            "Would update {} commits ({} would gain a ticket_id). No changes written.",
            total, with_id,
        );
        return Ok(());
    }

    // Pass 2: apply everything atomically.
    let tx = db.connection_mut().transaction()?;
    {
        let mut update =
            tx.prepare("UPDATE commits SET ticket_id = ?1, ticketed = ?2 WHERE id = ?3")?;
        for (id, ticket, ticketed) in &pending {
            update.execute(params![ticket, ticketed, id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated {} commits ({} now have a ticket_id).",
        total, with_id,
    );
    Ok(())
}
/// Returns `true` when `message`, ignoring leading whitespace, starts with the word
/// `revert` (ASCII case-insensitive) immediately followed by a space, a colon or a
/// double quote.
///
/// The check works on bytes, so a multi-byte character near the start cannot cause a
/// slicing panic.
fn is_revert(message: &str) -> bool {
    // Word (6 bytes) plus one separator byte; anything shorter cannot match.
    let Some(lead) = message.trim_start().as_bytes().get(..7) else {
        return false;
    };
    match lead {
        [word @ .., separator] => {
            word.eq_ignore_ascii_case(b"revert") && matches!(*separator, b' ' | b':' | b'"')
        }
        [] => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- shared fixtures -------------------------------------------------------------

    /// Opens a fresh in-memory database.
    fn open_db() -> Database {
        Database::open_in_memory().expect("open")
    }

    /// Inserts a minimal commit row in repository `r` with the given SHA and message.
    fn seed(db: &Database, sha: &str, message: &str) {
        db.connection()
            .execute(
                "INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
                 VALUES (?1, 'n', 'e', '2024-01-01T00:00:00Z', ?2, 'r')",
                params![sha, message],
            )
            .expect("insert");
    }

    /// Inserts a commit plus one `files` row per `(path, insertions, deletions)` entry and
    /// returns the commit's row id.
    fn seed_commit_with_files(
        db: &Database,
        sha: &str,
        repo: &str,
        timestamp: &str,
        files: &[(&str, u32, u32)],
    ) -> i64 {
        let conn = db.connection();
        conn.execute(
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
             VALUES (?1, 'tester', 'test@example.com', ?2, 'msg', ?3)",
            params![sha, timestamp, repo],
        )
        .expect("insert commit");
        let commit_id = conn.last_insert_rowid();
        for (path, ins, del) in files {
            conn.execute(
                "INSERT INTO files (commit_id, path, change_type, insertions, deletions) \
                 VALUES (?1, ?2, 'modified', ?3, ?4)",
                params![commit_id, path, ins, del],
            )
            .expect("insert file");
        }
        commit_id
    }

    /// Runs a query that yields a single integer; `what` is the panic label on failure.
    fn query_i64(db: &Database, sql: &str, what: &str) -> i64 {
        db.connection()
            .query_row(sql, [], |r| r.get(0))
            .expect(what)
    }

    /// Builds an effort row with no test code, the current formula version and
    /// `computed_at = 1_000_000`; tests override individual fields as needed.
    fn effort_row(
        sha: &str,
        repository: &str,
        size: &str,
        score: f64,
        loc: u32,
        files: u32,
    ) -> EffortRow {
        EffortRow {
            sha: sha.to_string(),
            repository: repository.to_string(),
            size: size.to_string(),
            score,
            loc,
            files,
            test_loc: 0,
            tests_factor: 1.0,
            formula_version: FORMULA_VERSION.to_string(),
            computed_at: 1_000_000,
        }
    }

    /// Effort arguments with every option unset.
    fn no_flags() -> EffortBackfillArgs {
        EffortBackfillArgs {
            repo: None,
            range: None,
            force: false,
            notes: false,
            limit: None,
        }
    }

    /// Runs the db-path scorer for `repo` and unwraps the result.
    fn run_db_path(
        db: &Database,
        repo: &str,
        args: &EffortBackfillArgs,
        dry_run: bool,
    ) -> (usize, usize, [usize; 5], Vec<EffortRow>) {
        process_one_repo_db(db.connection(), repo, args, dry_run).expect("db path")
    }

    // ---- message-derived flags -------------------------------------------------------

    #[test]
    fn revert_detector_matches_expected_forms() {
        assert!(is_revert("Revert \"feat: add login\""));
        assert!(is_revert("revert: bad merge"));
        assert!(is_revert("Revert this change"));
        assert!(!is_revert("Refactor revert handling"));
        assert!(!is_revert("Fix bug in feature"));
    }

    #[test]
    fn ticket_id_extraction_prefers_specific_patterns() {
        assert_eq!(
            extract_ticket_id("AB#42 implement"),
            Some("AB#42".to_string())
        );
        assert_eq!(
            extract_ticket_id("ENG-123: feature"),
            Some("ENG-123".to_string())
        );
        assert_eq!(extract_ticket_id("fixes #99"), Some("#99".to_string()));
        assert_eq!(extract_ticket_id("misc cleanup"), None);
    }

    #[test]
    fn backfill_revert_flags_updates_only_changed_rows() {
        let mut db = open_db();
        seed(&db, "a", "Revert \"foo\"");
        seed(&db, "b", "feat: thing");

        backfill_revert_flags(&mut db, false).expect("backfill");

        let reverts = query_i64(&db, "SELECT COUNT(*) FROM commits WHERE is_revert = 1", "q");
        assert_eq!(reverts, 1);
    }

    #[test]
    fn backfill_ticket_ids_populates_ticket_id() {
        let mut db = open_db();
        seed(&db, "a", "ENG-7: thing");
        seed(&db, "b", "no ticket");

        backfill_ticket_ids(&mut db, false).expect("backfill");

        let t: Option<String> = db
            .connection()
            .query_row("SELECT ticket_id FROM commits WHERE sha = 'a'", [], |r| {
                r.get(0)
            })
            .expect("q");
        assert_eq!(t, Some("ENG-7".to_string()));
        let n = query_i64(&db, "SELECT COUNT(*) FROM commits WHERE ticketed = 1", "q");
        assert_eq!(n, 1);
    }

    #[test]
    fn dry_run_does_not_modify_rows() {
        let mut db = open_db();
        seed(&db, "a", "Revert \"foo\"");

        backfill_revert_flags(&mut db, true).expect("dry run");

        let reverts = query_i64(&db, "SELECT COUNT(*) FROM commits WHERE is_revert = 1", "q");
        assert_eq!(reverts, 0);
    }

    // ---- persist_effort_rows ---------------------------------------------------------

    #[test]
    fn backfill_effort_persists_rows() {
        let mut db = open_db();
        let rows = vec![effort_row("abc123", "testrepo", "M", 9.1, 50, 2)];

        persist_effort_rows(&mut db, &rows).expect("persist");

        let (size, score, loc, files): (String, f64, i64, i64) = db
            .connection()
            .query_row(
                "SELECT size, score, loc, files \
                 FROM fact_commit_effort WHERE sha = 'abc123'",
                [],
                |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
            )
            .expect("query");
        assert_eq!(size, "M");
        assert!((score - 9.1).abs() < 0.001);
        assert_eq!(loc, 50);
        assert_eq!(files, 2);
    }

    #[test]
    fn backfill_effort_force_recomputes() {
        let mut db = open_db();
        let first = vec![effort_row("deadbeef", "repo", "XS", 1.0, 1, 1)];
        persist_effort_rows(&mut db, &first).expect("first persist");

        let second = vec![EffortRow {
            computed_at: 2_000_000,
            ..effort_row("deadbeef", "repo", "XL", 99.9, 100_000, 500)
        }];
        persist_effort_rows(&mut db, &second).expect("second persist");

        let count = query_i64(
            &db,
            "SELECT COUNT(*) FROM fact_commit_effort WHERE sha = 'deadbeef'",
            "count",
        );
        assert_eq!(count, 1, "UPSERT must not create duplicate rows");
        let score: f64 = db
            .connection()
            .query_row(
                "SELECT score FROM fact_commit_effort WHERE sha = 'deadbeef'",
                [],
                |r| r.get(0),
            )
            .expect("score");
        assert!(
            (score - 99.9).abs() < 0.001,
            "score must be updated to 99.9"
        );
    }

    #[test]
    fn backfill_effort_same_sha_different_repos() {
        let mut db = open_db();
        let rows = vec![
            effort_row("cafebabe", "repo-a", "S", 5.5, 30, 2),
            effort_row("cafebabe", "repo-b", "M", 8.0, 60, 3),
        ];

        persist_effort_rows(&mut db, &rows).expect("persist");

        let count = query_i64(
            &db,
            "SELECT COUNT(*) FROM fact_commit_effort WHERE sha = 'cafebabe'",
            "count",
        );
        assert_eq!(count, 2, "same SHA in two repos must produce two rows");
    }

    #[test]
    fn backfill_effort_empty_produces_no_rows() {
        let mut db = open_db();

        persist_effort_rows(&mut db, &[]).expect("empty persist");

        let count = query_i64(&db, "SELECT COUNT(*) FROM fact_commit_effort", "count");
        assert_eq!(count, 0);
    }

    // ---- process_one_repo_db ---------------------------------------------------------

    #[test]
    fn backfill_effort_db_path_populates_fact_table() {
        let mut db = open_db();
        seed_commit_with_files(
            &db,
            "aaa111",
            "myrepo",
            "2024-01-01T00:00:00Z",
            &[("src/main.rs", 30, 10), ("src/lib.rs", 5, 2)],
        );
        seed_commit_with_files(
            &db,
            "bbb222",
            "myrepo",
            "2024-01-02T00:00:00Z",
            &[("src/tests/foo_test.rs", 20, 0)],
        );

        let (scored, skipped, _sizes, rows) = run_db_path(&db, "myrepo", &no_flags(), false);
        assert_eq!(scored, 2, "both commits should be scored");
        assert_eq!(skipped, 0, "nothing pre-scored");

        persist_effort_rows(&mut db, &rows).expect("persist");
        let count = query_i64(
            &db,
            "SELECT COUNT(*) FROM fact_commit_effort WHERE repository = 'myrepo'",
            "count",
        );
        assert_eq!(count, 2, "two effort rows expected");

        let (size_b, tests_factor_b): (String, f64) = db
            .connection()
            .query_row(
                "SELECT size, tests_factor FROM fact_commit_effort WHERE sha = 'bbb222'",
                [],
                |r| Ok((r.get(0)?, r.get(1)?)),
            )
            .expect("bbb222 row");
        assert!(
            (tests_factor_b - 0.7).abs() < 1e-6,
            "expected tests_factor=0.7 for all-test commit, got {tests_factor_b}"
        );
        assert_eq!(size_b, "S", "all-test commit should be S");
    }

    #[test]
    fn backfill_effort_db_path_skips_already_scored() {
        let mut db = open_db();
        seed_commit_with_files(
            &db,
            "scored111",
            "repo",
            "2024-01-01T00:00:00Z",
            &[("src/a.rs", 10, 0)],
        );
        seed_commit_with_files(
            &db,
            "unscored222",
            "repo",
            "2024-01-02T00:00:00Z",
            &[("src/b.rs", 5, 5)],
        );
        let pre = vec![EffortRow {
            computed_at: 0,
            ..effort_row("scored111", "repo", "XS", 1.0, 10, 1)
        }];
        persist_effort_rows(&mut db, &pre).expect("pre-persist");

        let (scored, skipped, _sizes, rows) = run_db_path(&db, "repo", &no_flags(), false);

        assert_eq!(scored, 1, "only unscored222 should be scored");
        assert_eq!(skipped, 1, "scored111 should be skipped");
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].sha, "unscored222");
    }

    #[test]
    fn backfill_effort_db_path_force_rescores_all() {
        let mut db = open_db();
        seed_commit_with_files(
            &db,
            "sha001",
            "repo",
            "2024-01-01T00:00:00Z",
            &[("src/x.rs", 100, 50)],
        );
        let stale = vec![EffortRow {
            formula_version: "v0".to_string(),
            computed_at: 0,
            ..effort_row("sha001", "repo", "XS", 0.1, 1, 1)
        }];
        persist_effort_rows(&mut db, &stale).expect("stale persist");

        let args = EffortBackfillArgs {
            force: true,
            ..no_flags()
        };
        let (scored, skipped, _sizes, rows) = run_db_path(&db, "repo", &args, false);

        assert_eq!(scored, 1, "force path should score the commit");
        assert_eq!(skipped, 0, "nothing should be skipped with --force");
        assert!(
            rows[0].score > 1.0,
            "re-scored effort should be higher than stale 0.1"
        );
    }

    #[test]
    fn backfill_effort_db_path_skips_commit_with_no_files() {
        let db = open_db();
        db.connection()
            .execute(
                "INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
                 VALUES ('empty001', 'tester', 'test@example.com', '2024-01-01T00:00:00Z', 'empty', 'repo')",
                [],
            )
            .expect("insert commit");

        let (scored, skipped, _sizes, rows) = run_db_path(&db, "repo", &no_flags(), false);

        assert_eq!(scored, 0, "commit with no files should produce no records");
        assert_eq!(skipped, 0);
        assert!(rows.is_empty());
    }

    #[test]
    fn backfill_effort_db_path_respects_limit() {
        let db = open_db();
        for i in 0..5u32 {
            seed_commit_with_files(
                &db,
                &format!("sha{i:03}"),
                "repo",
                &format!("2024-01-{:02}T00:00:00Z", i + 1),
                &[("src/foo.rs", 10, 5)],
            );
        }

        let args = EffortBackfillArgs {
            limit: Some(3),
            ..no_flags()
        };
        let (scored, _skipped, _sizes, rows) = run_db_path(&db, "repo", &args, false);

        assert_eq!(scored, 3, "limit=3 should cap at 3 records");
        assert_eq!(rows.len(), 3);
    }

    #[test]
    fn backfill_effort_db_path_scoped_to_repo() {
        let db = open_db();
        seed_commit_with_files(
            &db,
            "alpha001",
            "repo-alpha",
            "2024-01-01T00:00:00Z",
            &[("src/a.rs", 20, 10)],
        );
        seed_commit_with_files(
            &db,
            "beta001",
            "repo-beta",
            "2024-01-01T00:00:00Z",
            &[("src/b.rs", 50, 20)],
        );

        let (scored, _skipped, _sizes, rows) = run_db_path(&db, "repo-alpha", &no_flags(), false);

        assert_eq!(scored, 1);
        assert_eq!(rows[0].sha, "alpha001");
        assert_eq!(rows[0].repository, "repo-alpha");
    }

    #[test]
    fn backfill_effort_db_path_dry_run_returns_rows_without_persisting() {
        let db = open_db();
        seed_commit_with_files(
            &db,
            "drysha1",
            "repo",
            "2024-01-01T00:00:00Z",
            &[("src/main.rs", 40, 10)],
        );

        let (scored, _skipped, _sizes, rows) = run_db_path(&db, "repo", &no_flags(), true);

        assert_eq!(
            scored, 1,
            "db path should return 1 scored row even in dry_run"
        );
        assert_eq!(rows.len(), 1);
        let count = query_i64(&db, "SELECT COUNT(*) FROM fact_commit_effort", "count");
        assert_eq!(count, 0, "dry_run must not write to fact_commit_effort");
    }
}