use clap::{Args, Subcommand};
use git2::{Repository, Sort};
use rusqlite::{params, Connection};
use tga::classify::taxonomy::TaxonomyRegistry;
use tga::classify::ClassificationPipeline;
use tga::collect::ai_attribution::detect_ai_tool;
use tga::collect::git::scan_and_persist;
use tga::collect::ticket::{extract_ticket_id, is_ticketed};
use tga::core::config::{expand_path, Config};
use tga::core::db::{CheckpointMode, Database};
use tga::core::effort::{compute_effort, effort_tshirt_from_size, FORMULA_VERSION};
use tga::report::aggregator::Aggregator;
// Arguments for `tga backfill`: one subcommand plus filter/preview flags that
// are declared `global = true`, so clap accepts them on every subcommand.
// NOTE: plain `//` comments are used deliberately in this struct; with clap's
// derive, `///` doc comments on fields become help text and would alter the
// CLI output.
#[derive(Args, Debug)]
#[command(
about = "Retroactive maintenance operations on existing commit rows.",
long_about = "Re-run extraction or scoring steps on commits already in the database.\n\n\
These operations update existing rows in-place rather than ingesting new data.\n\
Each subcommand supports --dry-run to preview changes without writing.\n\n\
NOTE: --branch is collect-only. Commits in the DB do not carry branch\n\
attribution after the walk, so there is no branch filter on backfill operations.\n\
If you need to re-walk specific branches, use `tga collect --branch <name>`.\n\n\
TIPS:\n\
- Use --repos to limit scope to one service at a time on large corpora.\n\
- Use --since/--until or --weeks to limit the date window for fast iteration.",
after_help = "EXAMPLES:\n\
# Re-extract ticket IDs for all commits (after pattern change)\n\
tga backfill ticket-ids\n\n\
# Re-score effort for the last 4 weeks of one repo\n\
tga backfill effort --repos my-service --weeks 4 --force\n\n\
# Re-run reachability scan after adding release-branch patterns\n\
tga backfill reachability --repos core-api"
)]
pub struct BackfillArgs {
// The maintenance operation to execute; `run` dispatches on this value.
#[command(subcommand)]
pub subcommand: BackfillSubcommand,
// Preview flag (defaults to false) that `run` forwards to every handler.
#[arg(long, default_value_t = false, global = true)]
pub dry_run: bool,
// Comma-separated repository names. `run` forwards this list only to the
// revert-flags, ticket-ids, reachability, effort, ticketed and
// ai-detection-commits handlers; an empty list means "no repository filter".
#[arg(long, value_delimiter = ',', global = true)]
pub repos: Vec<String>,
// Week count handed to the date-range resolver; clap rejects it when combined
// with --since or --until.
#[arg(long, value_name = "N", global = true, conflicts_with_all = ["since", "until"])]
pub weeks: Option<u32>,
// Lower date bound handed to the date-range resolver; conflicts with --weeks.
#[arg(long, value_name = "DATE", global = true, conflicts_with = "weeks")]
pub since: Option<String>,
// Upper date bound handed to the date-range resolver; conflicts with --weeks.
#[arg(long, value_name = "DATE", global = true, conflicts_with = "weeks")]
pub until: Option<String>,
}
// The individual backfill operations. Each variant maps to exactly one handler
// in `run`. Plain `//` comments are used so clap's derive does not pick the
// text up as subcommand help.
#[derive(Subcommand, Debug)]
pub enum BackfillSubcommand {
// Handled by `backfill_ai_detection`: clears the classification on commits
// whose LLM classification has confidence below 0.7 so they get reprocessed.
AiDetection,
// Handled by `backfill_revert_flags`: recomputes `commits.is_revert` from the
// commit message.
RevertFlags,
// Handled by `backfill_ticket_ids`: recomputes `commits.ticket_id` and
// `commits.ticketed` from the commit message.
TicketIds,
// Handled by `backfill_reachability`: re-runs the reachability scan for the
// configured repositories.
Reachability,
// Handled by `backfill_effort`: computes effort rows and stores them in
// `fact_commit_effort`.
Effort(EffortBackfillArgs),
// Handled by `backfill_complexity`: asks the classification pipeline to fill
// in missing complexity scores.
Complexity(ComplexityBackfillArgs),
// Handled by `backfill_ticketed`: recomputes only `commits.ticketed`.
Ticketed,
// Handled by `backfill_ai_detection_commits`: recomputes
// `commits.is_ai_assisted` and `commits.ai_tool` from the commit message.
AiDetectionCommits,
// Handled by `backfill_top_level`: fills `classifications.top_level_category`
// where it is NULL.
TopLevel,
// Handled by `backfill_effort_tshirt`: rebins `effort_tshirt` for the rows in
// `fact_commit_effort`.
EffortTshirt,
// Handled by `backfill_quality`: rebuilds the rows of `fact_weekly_quality`.
Quality,
}
// Options specific to `backfill complexity`. (`//` rather than `///` so clap
// does not turn the comments into help text.)
#[derive(Args, Debug)]
pub struct ComplexityBackfillArgs {
// When set, `backfill_complexity` forces `classification.use_llm = true` on
// the configuration before building the pipeline. Defaults to false.
#[arg(long, default_value_t = false)]
pub use_llm: bool,
}
// Options specific to `backfill effort`. (`//` rather than `///` so clap does
// not turn the comments into help text.)
#[derive(Args, Debug)]
pub struct EffortBackfillArgs {
// Git revision selector for the git-walk path: either `base..tip` (base may
// be empty) or a single ref. Supplying it switches effort scoring from the
// database path to the git path.
#[arg(long, value_name = "RANGE")]
pub range: Option<String>,
// Re-score commits even when they already have a row in
// `fact_commit_effort` (the already-scored set is left empty).
#[arg(long, default_value_t = false)]
pub force: bool,
// Also write a git note under `refs/notes/effort` for every scored commit;
// like `range`, this selects the git path.
#[arg(long, default_value_t = false)]
pub notes: bool,
// Upper bound on the number of commits scored per repository; unlimited when
// absent.
#[arg(long, value_name = "N")]
pub limit: Option<usize>,
}
/// Turns the global `--weeks` / `--since` / `--until` flags into a
/// `(since, until)` pair by delegating to the shared date-range resolver.
///
/// The resolver's fourth argument is always passed as `None` from here.
///
/// # Errors
/// Propagates whatever error the shared resolver reports.
fn resolve_backfill_date_range(
    args: &BackfillArgs,
) -> anyhow::Result<(Option<String>, Option<String>)> {
    let lower = args.since.as_deref();
    let upper = args.until.as_deref();
    crate::commands::date_range::resolve_date_range(args.weeks, lower, upper, None)
}
pub async fn run(config: Config, db: &mut Database, args: BackfillArgs) -> anyhow::Result<()> {
let (since, until) = resolve_backfill_date_range(&args)?;
let repos = args.repos.clone();
match args.subcommand {
BackfillSubcommand::AiDetection => backfill_ai_detection(db, args.dry_run),
BackfillSubcommand::RevertFlags => {
backfill_revert_flags(db, args.dry_run, &repos, since.as_deref(), until.as_deref())
}
BackfillSubcommand::TicketIds => {
backfill_ticket_ids(db, args.dry_run, &repos, since.as_deref(), until.as_deref())
}
BackfillSubcommand::Reachability => backfill_reachability(config, db, &repos, args.dry_run),
BackfillSubcommand::Effort(effort_args) => backfill_effort(
config,
db,
effort_args,
&repos,
since.as_deref(),
until.as_deref(),
args.dry_run,
),
BackfillSubcommand::Complexity(complexity_args) => {
backfill_complexity(config, db, complexity_args, args.dry_run).await
}
BackfillSubcommand::Ticketed => {
backfill_ticketed(db, args.dry_run, &repos, since.as_deref(), until.as_deref())
}
BackfillSubcommand::AiDetectionCommits => backfill_ai_detection_commits(
db,
args.dry_run,
&repos,
since.as_deref(),
until.as_deref(),
),
BackfillSubcommand::TopLevel => backfill_top_level(db, args.dry_run),
BackfillSubcommand::EffortTshirt => backfill_effort_tshirt(db, args.dry_run),
BackfillSubcommand::Quality => backfill_quality(db, args.dry_run),
}
}
async fn backfill_complexity(
config: Config,
db: &mut Database,
args: ComplexityBackfillArgs,
dry_run: bool,
) -> anyhow::Result<()> {
if dry_run {
let candidates: i64 = db
.connection()
.query_row(
"SELECT COUNT(*) FROM classifications \
WHERE complexity IS NULL AND method != 'exact_rule'",
[],
|row| row.get(0),
)
.unwrap_or(0);
println!(
"Dry run — would request complexity scores for {candidates} classification(s) \
(complexity IS NULL, method != 'exact_rule'). No changes written."
);
return Ok(());
}
let mut cfg = config;
if args.use_llm {
let classification = cfg
.classification
.get_or_insert_with(tga::core::config::ClassificationConfig::default);
classification.use_llm = true;
}
let pipeline = ClassificationPipeline::new(cfg);
let updated = pipeline.backfill_complexity(db).await?;
println!("Backfilled complexity for {updated} commit(s)");
if let Err(e) = db.wal_checkpoint(CheckpointMode::Truncate) {
tracing::warn!(error = %e, "WAL TRUNCATE checkpoint failed after complexity backfill");
}
Ok(())
}
/// Scores commit effort for every configured repository that passes
/// `repos_filter` and prints a per-repository and overall summary.
///
/// A repository's name is its configured name, else the last path component,
/// else the full path as text. When `--range` or `--notes` is given the git
/// walk is used; otherwise commits are read from the database and the rows are
/// persisted here unless `dry_run` is set.
///
/// `since` / `until` are only logged: neither scoring path applies them, which
/// the emitted warning spells out. A failure in one repository is logged and
/// printed, and processing continues with the next one.
fn backfill_effort(
    config: Config,
    db: &mut Database,
    args: EffortBackfillArgs,
    repos_filter: &[String],
    since: Option<&str>,
    until: Option<&str>,
    dry_run: bool,
) -> anyhow::Result<()> {
    let mut repos_to_process: Vec<(std::path::PathBuf, String)> = Vec::new();
    for repo_cfg in &config.repositories {
        let path = expand_path(&repo_cfg.path);
        let name = match repo_cfg.name.clone() {
            Some(explicit) => explicit,
            None => match path.file_name().and_then(|s| s.to_str()) {
                Some(base) => base.to_string(),
                None => path.display().to_string(),
            },
        };
        if repos_filter.is_empty() || repos_filter.contains(&name) {
            repos_to_process.push((path, name));
        }
    }

    if since.is_some() || until.is_some() {
        tracing::info!(
            since = ?since,
            until = ?until,
            "effort backfill: applying date window filter (--since/--until/--weeks)"
        );
        tracing::warn!(
            "effort backfill: --since/--until/--weeks filters affect the log output only;\n\
             the db-only path queries all commits for each repo via `commits JOIN files`.\n\
             For precise date-scoped effort scoring use --range on the git path."
        );
    }

    if repos_to_process.is_empty() {
        println!("No matching repositories found in config.");
        return Ok(());
    }

    let use_git_path = args.range.is_some() || args.notes;
    let (mut total_scored, mut total_skipped, mut total_repos) = (0usize, 0usize, 0usize);
    let mut size_counts = [0usize; 5];

    for (repo_path, repo_name) in &repos_to_process {
        let outcome = if use_git_path {
            process_one_repo_git(repo_path, repo_name, db, &args, dry_run)
        } else {
            match process_one_repo_db(db.connection(), repo_name, &args, dry_run) {
                Ok((scored, skipped, sizes, rows)) => {
                    // The db path only computes rows; writing them is our job.
                    let persisted = if dry_run {
                        Ok(())
                    } else {
                        persist_effort_rows(db, &rows)
                    };
                    persisted.map(|()| (scored, skipped, sizes))
                }
                Err(e) => Err(e),
            }
        };

        match outcome {
            Err(e) => {
                tracing::warn!(repo = %repo_name, error = %e, "backfill effort failed for repo");
                println!(" {repo_name}: error — {e}");
            }
            Ok((scored, skipped, sizes)) => {
                total_repos += 1;
                total_scored += scored;
                total_skipped += skipped;
                for (total, count) in size_counts.iter_mut().zip(sizes.iter()) {
                    *total += *count;
                }
                let verb = if dry_run { "would score" } else { "scored" };
                println!(
                    " {repo_name}: {verb} {scored} commits, skipped {skipped} already-scored"
                );
            }
        }
    }

    let verb = if dry_run { "Would score" } else { "Scored" };
    println!(
        "\nBackfill complete: {total_repos} repos, {verb} {total_scored} commits \
         ({} skipped already-scored).",
        total_skipped,
    );
    println!(
        " Size distribution: XS={} S={} M={} L={} XL={}",
        size_counts[0], size_counts[1], size_counts[2], size_counts[3], size_counts[4],
    );
    Ok(())
}
/// Computes effort rows for one repository from the `commits` / `files`
/// tables without touching git and without writing anything.
///
/// Returns `(scored, skipped, size_counts, rows)` where `size_counts` is
/// indexed XS, S, M, L, XL (any other label is counted as XL) and `rows` are
/// the records the caller may persist.
///
/// * `args.force` — when set, no commit is treated as already scored.
/// * `args.limit` — caps the number of rows produced.
/// * `dry_run` — only recorded in the completion log line.
///
/// # Errors
/// Propagates rusqlite errors from preparing or reading the queries.
fn process_one_repo_db(
    conn: &Connection,
    repo_name: &str,
    args: &EffortBackfillArgs,
    dry_run: bool,
) -> anyhow::Result<(usize, usize, [usize; 5], Vec<EffortRow>)> {
    use std::collections::HashSet;

    let already_scored: HashSet<String> = if args.force {
        HashSet::new()
    } else {
        let mut stmt = conn.prepare("SELECT sha FROM fact_commit_effort WHERE repository = ?1")?;
        let shas = stmt.query_map(params![repo_name], |row| row.get::<_, String>(0))?;
        let mut set = HashSet::new();
        for sha in shas {
            set.insert(sha?);
        }
        set
    };

    // Only used for log output, so a failed count degrades to 0.
    let in_db: i64 = conn
        .query_row(
            "SELECT COUNT(DISTINCT c.sha) FROM commits c WHERE c.repository = ?1",
            params![repo_name],
            |r| r.get(0),
        )
        .unwrap_or(0);
    tracing::info!(
        repo = %repo_name,
        in_db = in_db,
        already_scored = already_scored.len(),
        "effort backfill db path: starting"
    );

    // One row per (commit, file); the ordering keeps a commit's rows adjacent.
    let mut stmt = conn.prepare(
        "SELECT c.sha, f.path, f.insertions, f.deletions \
         FROM commits c \
         JOIN files f ON f.commit_id = c.id \
         WHERE c.repository = ?1 \
         ORDER BY c.timestamp ASC, c.sha ASC",
    )?;

    let limit = args.limit.unwrap_or(usize::MAX);
    let mut records: Vec<EffortRow> = Vec::new();
    let mut skipped: usize = 0;
    let mut current_sha: Option<String> = None;
    let mut current_files: Vec<(String, u32, u32)> = Vec::new();

    // Scores one fully-collected commit. Returns `false` once the limit is
    // reached so the caller can stop reading rows.
    let flush = |sha: &str,
                 files: &[(String, u32, u32)],
                 already_scored: &HashSet<String>,
                 records: &mut Vec<EffortRow>,
                 skipped: &mut usize|
     -> bool {
        if records.len() >= limit {
            return false;
        }
        if already_scored.contains(sha) {
            *skipped += 1;
            return true;
        }
        // Defensive: the inner JOIN above only yields commits that have file rows.
        if files.is_empty() {
            tracing::warn!(
                sha = %sha,
                "commit has no rows in the files table; skipping effort computation"
            );
            return true;
        }
        let file_refs: Vec<(&str, u32, u32)> =
            files.iter().map(|(p, i, d)| (p.as_str(), *i, *d)).collect();
        let effort = compute_effort(file_refs);
        let computed_at = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_secs() as i64)
            .unwrap_or(0);
        records.push(EffortRow {
            sha: sha.to_string(),
            repository: repo_name.to_string(),
            size: effort.size_label().to_string(),
            score: effort.score,
            loc: effort.loc,
            files: effort.files,
            test_loc: effort.test_loc,
            tests_factor: effort.tests_factor,
            formula_version: FORMULA_VERSION.to_string(),
            computed_at,
            effort_tshirt: effort_tshirt_from_size(effort.size_label()),
        });
        if records.len().is_multiple_of(1000) {
            tracing::info!(
                repo = %repo_name,
                processed = records.len(),
                "effort backfill db path: progress"
            );
        }
        true
    };

    let rows = stmt.query_map(params![repo_name], |row| {
        Ok((
            row.get::<_, String>(0)?,
            row.get::<_, String>(1)?,
            row.get::<_, u32>(2)?,
            row.get::<_, u32>(3)?,
        ))
    })?;

    for row_res in rows {
        let (sha, path, ins, del) = row_res?;
        if current_sha.as_deref() != Some(sha.as_str()) {
            // Commit boundary: score the commit collected so far (if any).
            if let Some(prev_sha) = current_sha.take() {
                let should_continue = flush(
                    &prev_sha,
                    &current_files,
                    &already_scored,
                    &mut records,
                    &mut skipped,
                );
                current_files.clear();
                if !should_continue || records.len() >= limit {
                    break;
                }
            }
            current_sha = Some(sha);
        }
        current_files.push((path, ins, del));
    }

    // The last commit has no following row to trigger its flush.
    if let Some(last_sha) = current_sha.take() {
        if records.len() < limit {
            flush(
                &last_sha,
                &current_files,
                &already_scored,
                &mut records,
                &mut skipped,
            );
        }
    }

    let mut size_counts = [0usize; 5];
    for row in &records {
        let idx = match row.size.as_str() {
            "XS" => 0,
            "S" => 1,
            "M" => 2,
            "L" => 3,
            _ => 4,
        };
        size_counts[idx] += 1;
    }

    tracing::info!(
        repo = %repo_name,
        in_db = in_db,
        scored = records.len(),
        skipped = skipped,
        dry_run = dry_run,
        "effort backfill db path: complete"
    );
    Ok((records.len(), skipped, size_counts, records))
}
fn process_one_repo_git(
repo_path: &std::path::Path,
repo_name: &str,
db: &mut Database,
args: &EffortBackfillArgs,
dry_run: bool,
) -> anyhow::Result<(usize, usize, [usize; 5])> {
let repo = Repository::open(repo_path)
.map_err(|e| anyhow::anyhow!("cannot open git repo {}: {e}", repo_path.display()))?;
let already_scored: std::collections::HashSet<String> = if args.force {
std::collections::HashSet::new()
} else {
let conn = db.connection();
let mut stmt = conn.prepare("SELECT sha FROM fact_commit_effort WHERE repository = ?1")?;
let rows = stmt.query_map(params![repo_name], |row| row.get::<_, String>(0))?;
let mut set = std::collections::HashSet::new();
for r in rows {
set.insert(r?);
}
set
};
let mut revwalk = repo.revwalk()?;
revwalk.set_sorting(Sort::TIME)?;
if let Some(ref range) = args.range {
if let Some((base, tip)) = range.split_once("..") {
let tip_oid = repo
.revparse_single(tip.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{tip}': {e}"))?
.id();
revwalk.push(tip_oid)?;
if !base.trim().is_empty() {
let base_oid = repo
.revparse_single(base.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{base}': {e}"))?
.id();
revwalk.hide(base_oid)?;
}
} else {
let oid = repo
.revparse_single(range.trim())
.map_err(|e| anyhow::anyhow!("cannot resolve git ref '{range}': {e}"))?
.id();
revwalk.push(oid)?;
}
} else {
let _ = revwalk.push_head();
}
let mut records: Vec<EffortRow> = Vec::new();
let mut skipped: usize = 0;
let limit = args.limit.unwrap_or(usize::MAX);
for oid_res in revwalk {
if records.len() >= limit {
break;
}
let oid = match oid_res {
Ok(o) => o,
Err(e) => {
tracing::warn!(repo = %repo_name, error = %e, "revwalk error; stopping");
break;
}
};
let sha_str = oid.to_string();
if already_scored.contains(&sha_str) {
skipped += 1;
continue;
}
let commit = match repo.find_commit(oid) {
Ok(c) => c,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot find commit; skipping");
continue;
}
};
let tree = match commit.tree() {
Ok(t) => t,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot get tree; skipping");
continue;
}
};
let parent_tree = if commit.parent_count() > 0 {
match commit.parent(0).and_then(|p| p.tree()) {
Ok(t) => Some(t),
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "cannot get parent tree; skipping");
continue;
}
}
} else {
None
};
let diff = match repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), None) {
Ok(d) => d,
Err(e) => {
tracing::warn!(sha = %sha_str, error = %e, "diff failed; skipping");
continue;
}
};
let file_stats: std::cell::RefCell<Vec<(String, u32, u32)>> =
std::cell::RefCell::new(Vec::new());
let _ = diff.foreach(
&mut |delta, _progress| {
let path = delta
.new_file()
.path()
.or_else(|| delta.old_file().path())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
file_stats.borrow_mut().push((path, 0, 0));
true
},
None,
None,
Some(&mut |delta, _hunk, line| {
let path = delta
.new_file()
.path()
.or_else(|| delta.old_file().path())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
let mut files = file_stats.borrow_mut();
if let Some(entry) = files.iter_mut().find(|e| e.0 == path) {
match line.origin() {
'+' => entry.1 = entry.1.saturating_add(1),
'-' => entry.2 = entry.2.saturating_add(1),
_ => {}
}
}
true
}),
);
let stats_snapshot = file_stats.into_inner();
let file_refs: Vec<(&str, u32, u32)> = stats_snapshot
.iter()
.map(|(p, ins, del)| (p.as_str(), *ins, *del))
.collect();
let effort = compute_effort(file_refs);
let computed_at = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0);
records.push(EffortRow {
sha: sha_str,
repository: repo_name.to_string(),
size: effort.size_label().to_string(),
score: effort.score,
loc: effort.loc,
files: effort.files,
test_loc: effort.test_loc,
tests_factor: effort.tests_factor,
formula_version: FORMULA_VERSION.to_string(),
computed_at,
effort_tshirt: effort_tshirt_from_size(effort.size_label()),
});
if records.len().is_multiple_of(1000) {
tracing::info!(
repo = %repo_name,
processed = records.len(),
"effort backfill progress"
);
}
}
if args.notes && !dry_run {
write_effort_notes(&repo, &records);
}
let mut size_counts = [0usize; 5];
for row in &records {
let idx = match row.size.as_str() {
"XS" => 0,
"S" => 1,
"M" => 2,
"L" => 3,
_ => 4, };
size_counts[idx] += 1;
}
if !dry_run {
persist_effort_rows(db, &records)?;
}
Ok((records.len(), skipped, size_counts))
}
/// One computed effort record for a single commit, as produced by the db and
/// git scoring paths and written to `fact_commit_effort` by
/// `persist_effort_rows`.
struct EffortRow {
/// Commit SHA the record belongs to.
sha: String,
/// Repository name the commit was scored under.
repository: String,
/// Size label taken from `size_label()` on the computed effort; the size
/// tallies recognise "XS", "S", "M", "L" and count anything else as XL.
size: String,
/// Score copied from the computed effort.
score: f64,
/// `loc` value copied from the computed effort.
loc: u32,
/// `files` value copied from the computed effort.
files: u32,
/// `test_loc` value copied from the computed effort.
test_loc: u32,
/// `tests_factor` value copied from the computed effort.
tests_factor: f64,
/// Value of `FORMULA_VERSION` at the time of scoring.
formula_version: String,
/// Seconds since the Unix epoch when the row was computed (0 if the system
/// clock reads earlier than the epoch).
computed_at: i64,
// Filled from `effort_tshirt_from_size` at construction, but never read back:
// `persist_effort_rows` derives the stored band itself, hence the allow.
#[allow(dead_code)]
effort_tshirt: i64,
}
fn persist_effort_rows(db: &mut Database, rows: &[EffortRow]) -> anyhow::Result<()> {
let thresholds = tga::core::effort_percentile::load_thresholds(db.connection()).unwrap_or(None);
for chunk in rows.chunks(1000) {
let conn = db.connection_mut();
let tx = conn.transaction()?;
{
let mut stmt = tx.prepare(
"INSERT OR REPLACE INTO fact_commit_effort \
(sha, repository, size, score, loc, files, test_loc, tests_factor, \
formula_version, computed_at, effort_tshirt) \
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)",
)?;
for row in chunk {
let tshirt = match &thresholds {
Some(t) => t.band_for_score(row.score),
None => effort_tshirt_from_size(&row.size),
};
stmt.execute(params![
row.sha,
row.repository,
row.size,
row.score,
row.loc as i64,
row.files as i64,
row.test_loc as i64,
row.tests_factor,
row.formula_version,
row.computed_at,
tshirt,
])?;
}
}
tx.commit()?;
}
Ok(())
}
fn write_effort_notes(repo: &Repository, rows: &[EffortRow]) {
let sig = match repo.signature() {
Ok(s) => s,
Err(_) => match git2::Signature::now("tga", "tga@localhost") {
Ok(s) => s,
Err(e) => {
tracing::warn!(error = %e, "cannot create git signature for notes; skipping");
return;
}
},
};
for row in rows {
let oid = match git2::Oid::from_str(&row.sha) {
Ok(o) => o,
Err(_) => continue,
};
let note_body = format!("Effort: {}", row.size);
if let Err(e) = repo.note(
&sig,
&sig,
Some("refs/notes/effort"),
oid,
¬e_body,
true, ) {
tracing::warn!(sha = %row.sha, error = %e, "failed to write git note; skipping");
}
}
}
/// Re-runs the reachability scan for every configured repository selected by
/// `repos_filter` (an empty filter selects all of them).
///
/// A repository's name is its configured name, else the last path component,
/// else the full path as text. The selection is resolved once, so the dry-run
/// count is the number of repositories the real run would scan; filter entries
/// that match no configured repository are not counted.
///
/// A scan failure for one repository is logged and printed, and the remaining
/// repositories are still processed; the function then still returns `Ok`.
fn backfill_reachability(
    config: Config,
    db: &mut Database,
    repos_filter: &[String],
    dry_run: bool,
) -> anyhow::Result<()> {
    let selected: Vec<(std::path::PathBuf, String)> = config
        .repositories
        .iter()
        .filter_map(|repo_cfg| {
            let path = expand_path(&repo_cfg.path);
            let name = repo_cfg
                .name
                .clone()
                .or_else(|| {
                    path.file_name()
                        .and_then(|s| s.to_str())
                        .map(|s| s.to_string())
                })
                .unwrap_or_else(|| path.display().to_string());
            let wanted = repos_filter.is_empty() || repos_filter.contains(&name);
            wanted.then_some((path, name))
        })
        .collect();

    if dry_run {
        println!(
            "Dry run — would re-run reachability scan for {} repo(s). No changes written.",
            selected.len()
        );
        return Ok(());
    }

    let reach_cfg = &config.reachability;
    let conn = db.connection();
    let mut total_repos = 0usize;
    let mut total_rows = 0usize;
    let mut total_default_branch = 0usize;
    let mut errors: Vec<String> = Vec::new();

    for (path, name) in &selected {
        total_repos += 1;
        tracing::info!(repo = %name, "backfill reachability scan");
        match scan_and_persist(path, conn, reach_cfg, Some(name)) {
            Ok(stats) => {
                println!(
                    " {name}: {} rows upserted \
                     ({} on default branch, {} tagged, {} on release branch)",
                    stats.rows_upserted,
                    stats.default_branch_commits,
                    stats.tagged_commits,
                    stats.release_branch_commits,
                );
                total_rows += stats.rows_upserted;
                total_default_branch += stats.default_branch_commits;
            }
            Err(e) => {
                let msg = format!(" {name}: reachability scan failed: {e}");
                tracing::warn!("{msg}");
                println!("{msg}");
                errors.push(msg);
            }
        }
    }

    println!(
        "\nBackfill complete: {total_repos} repos, {total_rows} rows upserted, \
         {total_default_branch} commits on default branch."
    );
    if !errors.is_empty() {
        println!("{} repo(s) had errors (see warnings above).", errors.len());
    }
    Ok(())
}
/// Clears the classification on commits whose LLM classification is
/// low-confidence so that the next `tga classify` run reprocesses them.
///
/// A commit is selected when its classification has `method = 'llm'` and its
/// effective confidence — the commit-level value, falling back to the
/// classification's value when the commit-level one is NULL — is below 0.7.
/// The dry-run count and the real update share this one predicate, so the
/// preview reports exactly the rows the update would clear.
///
/// # Errors
/// Propagates rusqlite errors from the count query, the transaction or the
/// update.
fn backfill_ai_detection(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    // Shared selection so the preview and the update cannot drift apart.
    const LOW_CONFIDENCE_LLM: &str = "FROM commits c \
         JOIN classifications cl ON c.classification_id = cl.id \
         WHERE cl.method = 'llm' AND COALESCE(c.confidence, cl.confidence) < 0.7";

    if dry_run {
        let count_sql = format!("SELECT COUNT(*) {LOW_CONFIDENCE_LLM}");
        let count: i64 = db
            .connection()
            .query_row(&count_sql, [], |row| row.get(0))?;
        println!(
            "Would re-classify {count} commits (method='llm', confidence<0.7). No changes written."
        );
        return Ok(());
    }

    let update_sql = format!(
        "UPDATE commits SET classification_id = NULL, confidence = NULL \
         WHERE id IN (SELECT c.id {LOW_CONFIDENCE_LLM})"
    );
    let conn = db.connection_mut();
    let tx = conn.transaction()?;
    let n = tx.execute(&update_sql, [])?;
    tx.commit()?;
    println!(
        "Cleared classification on {n} commits — next `tga classify` run will reprocess them."
    );
    Ok(())
}
/// Recomputes `commits.is_revert` from each commit message for the commits
/// selected by the repository and date filters, touching only rows whose
/// stored flag differs from the detected one.
///
/// In dry-run mode the pending changes are only counted and printed.
///
/// # Errors
/// Propagates rusqlite errors from reading the commits or applying updates.
fn backfill_revert_flags(
    db: &mut Database,
    dry_run: bool,
    repos_filter: &[String],
    since: Option<&str>,
    until: Option<&str>,
) -> anyhow::Result<()> {
    // Phase 1: collect (id, new flag) for every row that needs to change.
    let pending: Vec<(i64, bool)> = {
        let (sql, binds) = build_commits_filter_sql(
            "SELECT id, message, is_revert FROM commits",
            repos_filter,
            since,
            until,
        );
        let mut stmt = db.connection().prepare(&sql)?;
        let rows = stmt.query_map(rusqlite::params_from_iter(binds.iter()), |row| {
            let id: i64 = row.get(0)?;
            let message: String = row.get(1)?;
            let stored: i64 = row.get(2)?;
            Ok((id, message, stored))
        })?;
        let mut changed = Vec::new();
        for row in rows {
            let (id, message, stored) = row?;
            let detected = is_revert(&message);
            if i64::from(detected) != stored {
                changed.push((id, detected));
            }
        }
        changed
    };
    let revert_count = pending.iter().filter(|(_, flag)| *flag).count();

    if dry_run {
        println!(
            "Would update {} commits ({} would be marked as reverts). No changes written.",
            pending.len(),
            revert_count,
        );
        return Ok(());
    }

    // Phase 2: apply all changes in one transaction.
    let tx = db.connection_mut().transaction()?;
    {
        let mut up = tx.prepare("UPDATE commits SET is_revert = ?1 WHERE id = ?2")?;
        for (id, flag) in &pending {
            up.execute(params![i64::from(*flag), id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated is_revert on {} commits ({} are reverts).",
        pending.len(),
        revert_count,
    );
    Ok(())
}
/// Recomputes `commits.ticket_id` and `commits.ticketed` from each commit
/// message for the commits selected by the repository and date filters,
/// touching only rows where either value differs from what is stored.
///
/// In dry-run mode the pending changes are only counted and printed.
///
/// # Errors
/// Propagates rusqlite errors from reading the commits or applying updates.
fn backfill_ticket_ids(
    db: &mut Database,
    dry_run: bool,
    repos_filter: &[String],
    since: Option<&str>,
    until: Option<&str>,
) -> anyhow::Result<()> {
    // Phase 1: collect (id, new ticket id, new ticketed flag) per changed row.
    let pending: Vec<(i64, Option<String>, i64)> = {
        let (sql, binds) = build_commits_filter_sql(
            "SELECT id, message, ticket_id, ticketed FROM commits",
            repos_filter,
            since,
            until,
        );
        let mut stmt = db.connection().prepare(&sql)?;
        let rows = stmt.query_map(rusqlite::params_from_iter(binds.iter()), |row| {
            let id: i64 = row.get(0)?;
            let message: String = row.get(1)?;
            let stored_id: Option<String> = row.get(2)?;
            let stored_flag: i64 = row.get(3)?;
            Ok((id, message, stored_id, stored_flag))
        })?;
        let mut changed = Vec::new();
        for row in rows {
            let (id, message, stored_id, stored_flag) = row?;
            let extracted = extract_ticket_id(&message);
            let ticketed = i64::from(is_ticketed(&message));
            let unchanged = extracted == stored_id && ticketed == stored_flag;
            if !unchanged {
                changed.push((id, extracted, ticketed));
            }
        }
        changed
    };
    let with_id = pending.iter().filter(|(_, id, _)| id.is_some()).count();

    if dry_run {
        println!(
            "Would update {} commits ({} would gain a ticket_id). No changes written.",
            pending.len(),
            with_id,
        );
        return Ok(());
    }

    // Phase 2: apply all changes in one transaction.
    let tx = db.connection_mut().transaction()?;
    {
        let mut up =
            tx.prepare("UPDATE commits SET ticket_id = ?1, ticketed = ?2 WHERE id = ?3")?;
        for (id, ticket, ticketed) in &pending {
            up.execute(params![ticket, ticketed, id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated {} commits ({} now have a ticket_id).",
        pending.len(),
        with_id,
    );
    Ok(())
}
/// Appends an optional `WHERE` clause to `base_sql` and returns the finished
/// SQL together with its positional bind values.
///
/// * `repos` — when non-empty, adds `repository IN (?1, ?2, …)`.
/// * `since` — when present, adds `timestamp >= ?N`.
/// * `until` — when present, adds `timestamp <= ?N`.
///
/// Predicates are joined with `AND`; placeholder numbers follow the order of
/// the returned values. With no filters `base_sql` is returned unchanged.
fn build_commits_filter_sql(
    base_sql: &str,
    repos: &[String],
    since: Option<&str>,
    until: Option<&str>,
) -> (String, Vec<rusqlite::types::Value>) {
    use rusqlite::types::Value;

    let mut binds: Vec<Value> = Vec::new();
    let mut clauses: Vec<String> = Vec::new();

    if !repos.is_empty() {
        // Repositories are bound first, so their placeholders are ?1..=?len.
        let slots = (1..=repos.len())
            .map(|n| format!("?{n}"))
            .collect::<Vec<_>>()
            .join(", ");
        binds.extend(repos.iter().cloned().map(Value::Text));
        clauses.push(format!("repository IN ({slots})"));
    }

    for (op, bound) in [(">=", since), ("<=", until)] {
        if let Some(value) = bound {
            binds.push(Value::Text(value.to_string()));
            clauses.push(format!("timestamp {op} ?{}", binds.len()));
        }
    }

    let sql = match clauses.is_empty() {
        true => base_sql.to_string(),
        false => format!("{base_sql} WHERE {}", clauses.join(" AND ")),
    };
    (sql, binds)
}
/// Reports whether `message` is recognised as a revert commit by delegating
/// to the shared detector in `tga::core::revert`.
fn is_revert(message: &str) -> bool {
    use tga::core::revert;
    revert::is_revert(message)
}
/// Recomputes `commits.ticketed` from each commit message for the commits
/// selected by the repository and date filters, touching only rows whose
/// stored value differs.
///
/// In dry-run mode the pending changes are only counted and printed.
///
/// # Errors
/// Propagates rusqlite errors from reading the commits or applying updates.
fn backfill_ticketed(
    db: &mut Database,
    dry_run: bool,
    repos_filter: &[String],
    since: Option<&str>,
    until: Option<&str>,
) -> anyhow::Result<()> {
    // Phase 1: collect (id, new ticketed value) for every changed row.
    let pending: Vec<(i64, i64)> = {
        let (sql, binds) = build_commits_filter_sql(
            "SELECT id, message, ticketed FROM commits",
            repos_filter,
            since,
            until,
        );
        let mut stmt = db.connection().prepare(&sql)?;
        let rows = stmt.query_map(rusqlite::params_from_iter(binds.iter()), |row| {
            let id: i64 = row.get(0)?;
            let message: String = row.get(1)?;
            let stored: i64 = row.get(2)?;
            Ok((id, message, stored))
        })?;
        let mut changed = Vec::new();
        for row in rows {
            let (id, message, stored) = row?;
            let recomputed = i64::from(is_ticketed(&message));
            if recomputed != stored {
                changed.push((id, recomputed));
            }
        }
        changed
    };

    let mut now_ticketed = 0usize;
    let mut now_unticketed = 0usize;
    for (_, value) in &pending {
        match *value {
            1 => now_ticketed += 1,
            0 => now_unticketed += 1,
            _ => {}
        }
    }

    if dry_run {
        println!(
            "Dry run — would update {} commits \
             ({} newly ticketed, {} newly unticketed). No changes written.",
            pending.len(),
            now_ticketed,
            now_unticketed,
        );
        return Ok(());
    }

    // Phase 2: apply all changes in one transaction.
    let tx = db.connection_mut().transaction()?;
    {
        let mut up = tx.prepare("UPDATE commits SET ticketed = ?1 WHERE id = ?2")?;
        for (id, val) in &pending {
            up.execute(params![val, id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated ticketed on {} commits \
         ({} newly ticketed, {} newly unticketed).",
        pending.len(),
        now_ticketed,
        now_unticketed,
    );
    Ok(())
}
/// Recomputes `commits.is_ai_assisted` and `commits.ai_tool` from each commit
/// message for the commits selected by the repository and date filters,
/// touching only rows whose stored tool differs from the detected one.
///
/// In dry-run mode the pending changes are only counted and printed.
///
/// # Errors
/// Propagates rusqlite errors from reading the commits or applying updates.
fn backfill_ai_detection_commits(
    db: &mut Database,
    dry_run: bool,
    repos_filter: &[String],
    since: Option<&str>,
    until: Option<&str>,
) -> anyhow::Result<()> {
    // Phase 1: collect (id, is_ai flag, detected tool) for every changed row.
    let pending: Vec<(i64, i64, Option<&'static str>)> = {
        let (sql, binds) = build_commits_filter_sql(
            "SELECT id, message, ai_tool FROM commits",
            repos_filter,
            since,
            until,
        );
        let mut stmt = db.connection().prepare(&sql)?;
        let fetched: Vec<(i64, String, Option<String>)> = stmt
            .query_map(rusqlite::params_from_iter(binds.iter()), |row| {
                let id: i64 = row.get(0)?;
                let message: String = row.get(1)?;
                let stored_tool: Option<String> = row.get(2)?;
                Ok((id, message, stored_tool))
            })?
            .collect::<Result<_, _>>()?;

        let mut changed = Vec::new();
        for (id, message, stored_tool) in fetched {
            let detected = detect_ai_tool(&message);
            if detected == stored_tool.as_deref() {
                continue;
            }
            let is_ai: i64 = match detected {
                Some(_) => 1,
                None => 0,
            };
            changed.push((id, is_ai, detected));
        }
        changed
    };
    let with_tool = pending.iter().filter(|(_, _, tool)| tool.is_some()).count();

    if dry_run {
        println!(
            "Dry run — would update {} commits ({} with AI tool detected). No changes written.",
            pending.len(),
            with_tool,
        );
        return Ok(());
    }

    // Phase 2: apply all changes in one transaction.
    let tx = db.connection_mut().transaction()?;
    {
        let mut up =
            tx.prepare("UPDATE commits SET is_ai_assisted = ?1, ai_tool = ?2 WHERE id = ?3")?;
        for (id, is_ai, tool) in &pending {
            up.execute(params![is_ai, tool, id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated {} commits ({} AI-assisted, {} cleared).",
        pending.len(),
        with_tool,
        pending.len() - with_tool,
    );
    Ok(())
}
/// Fills `classifications.top_level_category` for rows where it is NULL by
/// resolving the row's subcategory through the built-in taxonomy registry.
///
/// Rows without a subcategory, or whose subcategory the registry cannot
/// resolve, are left untouched. In dry-run mode the pending changes are only
/// counted and printed.
///
/// # Errors
/// Propagates rusqlite errors from reading the rows or applying updates.
fn backfill_top_level(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    let registry = TaxonomyRegistry::with_builtins();

    // Phase 1: collect (id, resolved top-level category) pairs.
    let pending: Vec<(i64, String)> = {
        let mut stmt = db.connection().prepare(
            "SELECT id, subcategory FROM classifications WHERE top_level_category IS NULL",
        )?;
        let fetched: Vec<(i64, Option<String>)> = stmt
            .query_map([], |row| {
                let id: i64 = row.get(0)?;
                let subcategory: Option<String> = row.get(1)?;
                Ok((id, subcategory))
            })?
            .collect::<Result<_, _>>()?;

        let mut resolved = Vec::new();
        for (id, subcategory) in fetched {
            let Some(sub) = subcategory else { continue };
            let Some(top) = registry.resolve(&sub) else { continue };
            resolved.push((id, top.as_str_snake().to_string()));
        }
        resolved
    };

    if dry_run {
        println!(
            "Dry run — would update top_level_category for {} classification(s). \
             No changes written.",
            pending.len(),
        );
        return Ok(());
    }

    // Phase 2: apply all changes in one transaction.
    let tx = db.connection_mut().transaction()?;
    {
        let mut up =
            tx.prepare("UPDATE classifications SET top_level_category = ?1 WHERE id = ?2")?;
        for (id, top) in &pending {
            up.execute(params![top, id])?;
        }
    }
    tx.commit()?;

    println!(
        "Updated top_level_category for {} classification(s).",
        pending.len()
    );
    Ok(())
}
/// Rebins `effort_tshirt` for the rows of `fact_commit_effort` via
/// `effort_percentile::rebin_all` and prints the outcome.
///
/// In dry-run mode only the row count is printed (a failing count query is
/// reported as 0). Otherwise the message depends on whether the rebin returned
/// corpus thresholds or not.
///
/// # Errors
/// Returns `percentile rebin failed: …` when the rebin itself fails.
fn backfill_effort_tshirt(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
    if dry_run {
        let counted = db
            .connection()
            .query_row("SELECT COUNT(*) FROM fact_commit_effort", [], |r| {
                r.get::<_, i64>(0)
            });
        let count = counted.unwrap_or(0);
        println!(
            "Dry run — would rebin effort_tshirt (percentile) for {count} row(s) \
             and persist corpus thresholds to effort_percentile_thresholds. \
             No changes written."
        );
        return Ok(());
    }

    let rebinned = tga::core::effort_percentile::rebin_all(db.connection_mut());
    let (rows_updated, thresholds) =
        rebinned.map_err(|e| anyhow::anyhow!("percentile rebin failed: {e}"))?;

    if let Some(t) = &thresholds {
        println!(
            "Rebinned effort_tshirt (percentile) for {rows_updated} row(s). \
             Corpus thresholds persisted: p20={:.3} p40={:.3} p60={:.3} p80={:.3} \
             (sample_count={}).",
            t.p20, t.p40, t.p60, t.p80, t.sample_count,
        );
    } else {
        println!(
            "Rebinned effort_tshirt for {rows_updated} row(s) using \
             static size-label mapping (corpus too small for percentile binning; \
             run again after collecting more commits)."
        );
    }
    Ok(())
}
fn backfill_quality(db: &mut Database, dry_run: bool) -> anyhow::Result<()> {
let config = tga::core::config::Config::default();
if dry_run {
let candidate: i64 = db
.connection()
.query_row(
"SELECT COUNT(*) FROM ( \
SELECT DISTINCT \
COALESCE(NULLIF(a.canonical_email, ''), c.author_email) AS ae, \
CAST(strftime('%Y', c.timestamp) AS INTEGER) AS yr, \
CAST(strftime('%W', c.timestamp) AS INTEGER) AS wk, \
c.repository \
FROM commits c \
LEFT JOIN authors a ON a.id = c.author_id \
)",
[],
|r| r.get(0),
)
.unwrap_or(0);
println!(
"Dry run — would write approximately {candidate} quality row(s) \
to fact_weekly_quality. No changes written."
);
return Ok(());
}
let data =
Aggregator::build(db, &config).map_err(|e| anyhow::anyhow!("aggregation failed: {e}"))?;
let written = Aggregator::persist_weekly_quality(db, &data)
.map_err(|e| anyhow::anyhow!("quality persist failed: {e}"))?;
println!("Backfilled fact_weekly_quality: {written} row(s) written (UPSERT semantics).");
if let Err(e) = db.wal_checkpoint(tga::core::db::CheckpointMode::Truncate) {
tracing::warn!(error = %e, "WAL TRUNCATE checkpoint failed after quality backfill");
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
fn seed(db: &Database, sha: &str, message: &str) {
db.connection()
.execute(
"INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
VALUES (?1, 'n', 'e', '2024-01-01T00:00:00Z', ?2, 'r')",
params![sha, message],
)
.expect("insert");
}
#[test]
fn revert_detector_matches_expected_forms() {
assert!(is_revert("Revert \"feat: add login\""));
assert!(is_revert("revert: bad merge"));
assert!(is_revert("Revert this change"));
assert!(!is_revert("Refactor revert handling"));
assert!(!is_revert("Fix bug in feature"));
}
#[test]
fn ticket_id_extraction_prefers_specific_patterns() {
assert_eq!(
extract_ticket_id("AB#42 implement"),
Some("AB#42".to_string())
);
assert_eq!(
extract_ticket_id("ENG-123: feature"),
Some("ENG-123".to_string())
);
assert_eq!(extract_ticket_id("fixes #99"), Some("#99".to_string()));
assert_eq!(extract_ticket_id("misc cleanup"), None);
}
#[test]
fn backfill_revert_flags_updates_only_changed_rows() {
let mut db = Database::open_in_memory().expect("open");
seed(&db, "a", "Revert \"foo\"");
seed(&db, "b", "feat: thing");
backfill_revert_flags(&mut db, false, &[], None, None).expect("backfill");
let reverts: i64 = db
.connection()
.query_row(
"SELECT COUNT(*) FROM commits WHERE is_revert = 1",
[],
|r| r.get(0),
)
.expect("q");
assert_eq!(reverts, 1);
}
/// After the ticket-id backfill, the commit whose message starts with
/// `ENG-7` has `ticket_id = 'ENG-7'`, and exactly one row is marked ticketed.
#[test]
fn backfill_ticket_ids_populates_ticket_id() {
    let mut db = Database::open_in_memory().expect("open");
    for (sha, message) in [("a", "ENG-7: thing"), ("b", "no ticket")] {
        seed(&db, sha, message);
    }
    backfill_ticket_ids(&mut db, false, &[], None, None).expect("backfill");

    let conn = db.connection();
    let extracted = conn
        .query_row("SELECT ticket_id FROM commits WHERE sha = 'a'", [], |row| {
            row.get::<_, Option<String>>(0)
        })
        .expect("q");
    assert_eq!(extracted, Some("ENG-7".to_string()));

    let ticketed_rows = conn
        .query_row("SELECT COUNT(*) FROM commits WHERE ticketed = 1", [], |row| {
            row.get::<_, i64>(0)
        })
        .expect("q");
    assert_eq!(ticketed_rows, 1);
}
/// With `dry_run = true` the revert-flag backfill must not set `is_revert`
/// on any row, even when a revert commit is present.
#[test]
fn dry_run_does_not_modify_rows() {
    let mut db = Database::open_in_memory().expect("open");
    seed(&db, "a", "Revert \"foo\"");

    let dry_run = true;
    backfill_revert_flags(&mut db, dry_run, &[], None, None).expect("dry run");

    let flagged = db
        .connection()
        .query_row(
            "SELECT COUNT(*) FROM commits WHERE is_revert = 1",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("q");
    assert_eq!(flagged, 0);
}
/// A dry-run complexity backfill must leave rows with `complexity IS NULL`
/// unscored: one NULL row goes in, one NULL row must remain.
#[tokio::test]
async fn backfill_complexity_dry_run_reports_candidates_without_writing() {
    let mut db = Database::open_in_memory().expect("open");

    // One classification still lacking a complexity score, one already scored.
    let fixtures = [
        (
            "INSERT INTO classifications (category, confidence, method, complexity) \
             VALUES ('feature', 0.5, 'regex_rule', NULL)",
            "insert null-complexity row",
        ),
        (
            "INSERT INTO classifications (category, confidence, method, complexity) \
             VALUES ('bugfix', 0.8, 'regex_rule', 3)",
            "insert scored row",
        ),
    ];
    for (sql, failure) in fixtures {
        db.connection().execute(sql, []).expect(failure);
    }

    let dry_run = true;
    backfill_complexity(
        Config::default(),
        &mut db,
        ComplexityBackfillArgs { use_llm: false },
        dry_run,
    )
    .await
    .expect("dry-run complexity backfill");

    let still_null = db
        .connection()
        .query_row(
            "SELECT COUNT(*) FROM classifications WHERE complexity IS NULL",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count null");
    assert_eq!(still_null, 1, "dry-run must not write complexity scores");
}
/// `persist_effort_rows` writes a single row whose `size`, `score`, `loc`
/// and `files` can be read back unchanged from `fact_commit_effort`.
#[test]
fn backfill_effort_persists_rows() {
    let mut db = Database::open_in_memory().expect("open");
    let batch = [EffortRow {
        sha: "abc123".to_string(),
        repository: "testrepo".to_string(),
        size: "M".to_string(),
        score: 9.1,
        loc: 50,
        files: 2,
        test_loc: 0,
        tests_factor: 1.0,
        formula_version: FORMULA_VERSION.to_string(),
        computed_at: 1_000_000,
        effort_tshirt: 3,
    }];
    persist_effort_rows(&mut db, &batch).expect("persist");

    let conn = db.connection();
    let stored: (String, f64, i64, i64) = conn
        .query_row(
            "SELECT size, score, loc, files \
             FROM fact_commit_effort WHERE sha = 'abc123'",
            [],
            |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)),
        )
        .expect("query");
    let (size, score, loc, files) = stored;

    assert_eq!(size, "M");
    assert!((score - 9.1).abs() < 0.001);
    assert_eq!(loc, 50);
    assert_eq!(files, 2);
}
/// Persisting a second row with the same SHA and repository must replace the
/// first one: still exactly one row, now carrying the new score.
#[test]
fn backfill_effort_force_recomputes() {
    let mut db = Database::open_in_memory().expect("open");

    let mut batch = vec![EffortRow {
        sha: "deadbeef".to_string(),
        repository: "repo".to_string(),
        size: "XS".to_string(),
        score: 1.0,
        loc: 1,
        files: 1,
        test_loc: 0,
        tests_factor: 1.0,
        formula_version: FORMULA_VERSION.to_string(),
        computed_at: 1_000_000,
        effort_tshirt: 1,
    }];
    persist_effort_rows(&mut db, &batch).expect("first persist");

    // Same (sha, repository); every scoring field gets a new value.
    {
        let row = &mut batch[0];
        row.size = "XL".to_string();
        row.score = 99.9;
        row.loc = 100_000;
        row.files = 500;
        row.computed_at = 2_000_000;
        row.effort_tshirt = 5;
    }
    persist_effort_rows(&mut db, &batch).expect("second persist");

    let conn = db.connection();
    let row_count = conn
        .query_row(
            "SELECT COUNT(*) FROM fact_commit_effort WHERE sha = 'deadbeef'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count");
    assert_eq!(row_count, 1, "UPSERT must not create duplicate rows");

    let stored_score = conn
        .query_row(
            "SELECT score FROM fact_commit_effort WHERE sha = 'deadbeef'",
            [],
            |row| row.get::<_, f64>(0),
        )
        .expect("score");
    assert!(
        (stored_score - 99.9).abs() < 0.001,
        "score must be updated to 99.9"
    );
}
/// The same SHA recorded under two different repositories must be stored as
/// two separate `fact_commit_effort` rows.
#[test]
fn backfill_effort_same_sha_different_repos() {
    let mut db = Database::open_in_memory().expect("open");

    // (repository, size, score, loc, files, effort_tshirt) per row; all
    // other fields are shared.
    let mut rows = Vec::new();
    for (repository, size, score, loc, files, effort_tshirt) in [
        ("repo-a", "S", 5.5, 30, 2, 2),
        ("repo-b", "M", 8.0, 60, 3, 3),
    ] {
        rows.push(EffortRow {
            sha: "cafebabe".to_string(),
            repository: repository.to_string(),
            size: size.to_string(),
            score,
            loc,
            files,
            test_loc: 0,
            tests_factor: 1.0,
            formula_version: FORMULA_VERSION.to_string(),
            computed_at: 1_000_000,
            effort_tshirt,
        });
    }
    persist_effort_rows(&mut db, &rows).expect("persist");

    let stored = db
        .connection()
        .query_row(
            "SELECT COUNT(*) FROM fact_commit_effort WHERE sha = 'cafebabe'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count");
    assert_eq!(stored, 2, "same SHA in two repos must produce two rows");
}
/// Persisting an empty batch succeeds and leaves `fact_commit_effort` empty.
#[test]
fn backfill_effort_empty_produces_no_rows() {
    let mut db = Database::open_in_memory().expect("open");
    persist_effort_rows(&mut db, &[]).expect("empty persist");

    let conn = db.connection();
    let total = conn
        .query_row("SELECT COUNT(*) FROM fact_commit_effort", [], |row| {
            row.get::<_, i64>(0)
        })
        .expect("count");
    assert_eq!(total, 0);
}
/// Inserts one commit plus one `files` row per `(path, insertions, deletions)`
/// entry and returns the rowid of the inserted commit.
///
/// Author and message are fixed placeholders; every file row is recorded
/// with `change_type = 'modified'`. Panics if any insert fails.
fn seed_commit_with_files(
    db: &Database,
    sha: &str,
    repo: &str,
    timestamp: &str,
    files: &[(&str, u32, u32)],
) -> i64 {
    const INSERT_COMMIT: &str =
        "INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
         VALUES (?1, 'tester', 'test@example.com', ?2, 'msg', ?3)";
    const INSERT_FILE: &str =
        "INSERT INTO files (commit_id, path, change_type, insertions, deletions) \
         VALUES (?1, ?2, 'modified', ?3, ?4)";

    let conn = db.connection();
    conn.execute(INSERT_COMMIT, params![sha, timestamp, repo])
        .expect("insert commit");
    // Must be read right after the commit insert, before any file insert.
    let commit_id = conn.last_insert_rowid();

    files.iter().for_each(|(path, insertions, deletions)| {
        conn.execute(
            INSERT_FILE,
            params![commit_id, path, insertions, deletions],
        )
        .expect("insert file");
    });
    commit_id
}
/// Scores two seeded commits through `process_one_repo_db`, persists the
/// result, and checks that both rows land in `fact_commit_effort`. The
/// commit touching only `src/tests/foo_test.rs` must be stored with
/// `tests_factor = 0.7` and size `S`.
#[test]
fn backfill_effort_db_path_populates_fact_table() {
    let mut db = Database::open_in_memory().expect("open");

    let mixed_files: &[(&str, u32, u32)] = &[("src/main.rs", 30, 10), ("src/lib.rs", 5, 2)];
    let test_only_files: &[(&str, u32, u32)] = &[("src/tests/foo_test.rs", 20, 0)];
    seed_commit_with_files(&db, "aaa111", "myrepo", "2024-01-01T00:00:00Z", mixed_files);
    seed_commit_with_files(
        &db,
        "bbb222",
        "myrepo",
        "2024-01-02T00:00:00Z",
        test_only_files,
    );

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: None,
    };
    let outcome = process_one_repo_db(db.connection(), "myrepo", &args, false);
    let (scored, skipped, _sizes, rows) = outcome.expect("db path");
    assert_eq!(scored, 2, "both commits should be scored");
    assert_eq!(skipped, 0, "nothing pre-scored");

    persist_effort_rows(&mut db, &rows).expect("persist");

    let conn = db.connection();
    let persisted = conn
        .query_row(
            "SELECT COUNT(*) FROM fact_commit_effort WHERE repository = 'myrepo'",
            [],
            |row| row.get::<_, i64>(0),
        )
        .expect("count");
    assert_eq!(persisted, 2, "two effort rows expected");

    let (size_b, tests_factor_b): (String, f64) = conn
        .query_row(
            "SELECT size, tests_factor FROM fact_commit_effort WHERE sha = 'bbb222'",
            [],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .expect("bbb222 row");
    assert!(
        (tests_factor_b - 0.7).abs() < 1e-6,
        "expected tests_factor=0.7 for all-test commit, got {tests_factor_b}"
    );
    assert_eq!(size_b, "S", "all-test commit should be S");
}
/// Without `force`, a commit that already has an effort row is counted as
/// skipped and only the unscored commit is returned.
#[test]
fn backfill_effort_db_path_skips_already_scored() {
    let mut db = Database::open_in_memory().expect("open");
    seed_commit_with_files(
        &db,
        "scored111",
        "repo",
        "2024-01-01T00:00:00Z",
        &[("src/a.rs", 10, 0)],
    );
    seed_commit_with_files(
        &db,
        "unscored222",
        "repo",
        "2024-01-02T00:00:00Z",
        &[("src/b.rs", 5, 5)],
    );

    // Give `scored111` an existing effort row before the backfill runs.
    let existing = [EffortRow {
        sha: "scored111".to_string(),
        repository: "repo".to_string(),
        size: "XS".to_string(),
        score: 1.0,
        loc: 10,
        files: 1,
        test_loc: 0,
        tests_factor: 1.0,
        formula_version: FORMULA_VERSION.to_string(),
        computed_at: 0,
        effort_tshirt: 1,
    }];
    persist_effort_rows(&mut db, &existing).expect("pre-persist");

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: None,
    };
    let outcome = process_one_repo_db(db.connection(), "repo", &args, false);
    let (scored, skipped, _sizes, rows) = outcome.expect("db path");

    assert_eq!(scored, 1, "only unscored222 should be scored");
    assert_eq!(skipped, 1, "scored111 should be skipped");
    assert_eq!(rows.len(), 1);
    assert_eq!(rows[0].sha, "unscored222");
}
/// With `force = true`, a commit that already has a (stale) effort row is
/// scored again instead of being skipped, and the fresh score exceeds 1.0.
#[test]
fn backfill_effort_db_path_force_rescores_all() {
    let mut db = Database::open_in_memory().expect("open");
    seed_commit_with_files(
        &db,
        "sha001",
        "repo",
        "2024-01-01T00:00:00Z",
        &[("src/x.rs", 100, 50)],
    );

    // Pre-existing row from an older formula version with a deliberately
    // tiny score, so a genuine re-score is distinguishable from it.
    let stale = vec![EffortRow {
        sha: "sha001".to_string(),
        repository: "repo".to_string(),
        size: "XS".to_string(),
        score: 0.1,
        loc: 1,
        files: 1,
        test_loc: 0,
        tests_factor: 1.0,
        formula_version: "v0".to_string(),
        computed_at: 0,
        effort_tshirt: 1,
    }];
    persist_effort_rows(&mut db, &stale).expect("stale persist");

    let args = EffortBackfillArgs {
        range: None,
        force: true,
        notes: false,
        limit: None,
    };
    let (scored, skipped, _sizes, rows) =
        process_one_repo_db(db.connection(), "repo", &args, false).expect("db path");
    assert_eq!(scored, 1, "force path should score the commit");
    assert_eq!(skipped, 0, "nothing should be skipped with --force");

    // Avoid `rows[0]`: on an empty result it would fail with a bare
    // index-out-of-bounds panic instead of a message naming the problem.
    let rescored = rows
        .first()
        .expect("force path must return the re-scored row");
    assert!(
        rescored.score > 1.0,
        "re-scored effort should be higher than stale 0.1"
    );
}
/// A commit that has no rows in `files` yields no effort record: nothing
/// scored, nothing skipped, no rows returned.
#[test]
fn backfill_effort_db_path_skips_commit_with_no_files() {
    let db = Database::open_in_memory().expect("open");
    let conn = db.connection();
    conn.execute(
        "INSERT INTO commits (sha, author_name, author_email, timestamp, message, repository) \
         VALUES ('empty001', 'tester', 'test@example.com', '2024-01-01T00:00:00Z', 'empty', 'repo')",
        [],
    )
    .expect("insert commit");

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: None,
    };
    let outcome = process_one_repo_db(conn, "repo", &args, false);
    let (scored, skipped, _sizes, rows) = outcome.expect("db path");

    assert_eq!(scored, 0, "commit with no files should produce no records");
    assert_eq!(skipped, 0);
    assert!(rows.is_empty());
}
/// With five eligible commits and `limit = Some(3)`, only three are scored
/// and returned.
#[test]
fn backfill_effort_db_path_respects_limit() {
    let db = Database::open_in_memory().expect("open");

    // Commits sha000..sha004, dated 2024-01-01 through 2024-01-05.
    for (index, day) in (0..5u32).zip(1u32..) {
        let sha = format!("sha{index:03}");
        let timestamp = format!("2024-01-{day:02}T00:00:00Z");
        seed_commit_with_files(&db, &sha, "repo", &timestamp, &[("src/foo.rs", 10, 5)]);
    }

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: Some(3),
    };
    let outcome = process_one_repo_db(db.connection(), "repo", &args, false);
    let (scored, _skipped, _sizes, rows) = outcome.expect("db path");

    assert_eq!(scored, 3, "limit=3 should cap at 3 records");
    assert_eq!(rows.len(), 3);
}
/// When two repositories have commits in the database, asking for
/// `repo-alpha` scores and returns only that repository's commit.
#[test]
fn backfill_effort_db_path_scoped_to_repo() {
    let db = Database::open_in_memory().expect("open");
    seed_commit_with_files(
        &db,
        "alpha001",
        "repo-alpha",
        "2024-01-01T00:00:00Z",
        &[("src/a.rs", 20, 10)],
    );
    seed_commit_with_files(
        &db,
        "beta001",
        "repo-beta",
        "2024-01-01T00:00:00Z",
        &[("src/b.rs", 50, 20)],
    );

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: None,
    };
    let (scored, _skipped, _sizes, rows) =
        process_one_repo_db(db.connection(), "repo-alpha", &args, false).expect("db path");
    assert_eq!(scored, 1);

    // Avoid `rows[0]`: on an empty result it would fail with a bare
    // index-out-of-bounds panic instead of a message naming the problem.
    let only = rows
        .first()
        .expect("repo-alpha commit must be returned");
    assert_eq!(only.sha, "alpha001");
    assert_eq!(only.repository, "repo-alpha");
}
/// In dry-run mode `process_one_repo_db` still returns the scored row, but
/// `fact_commit_effort` stays empty.
#[test]
fn backfill_effort_db_path_dry_run_returns_rows_without_persisting() {
    let db = Database::open_in_memory().expect("open");
    seed_commit_with_files(
        &db,
        "drysha1",
        "repo",
        "2024-01-01T00:00:00Z",
        &[("src/main.rs", 40, 10)],
    );

    let args = EffortBackfillArgs {
        range: None,
        force: false,
        notes: false,
        limit: None,
    };
    let dry_run = true;
    let conn = db.connection();
    let (scored, _skipped, _sizes, rows) =
        process_one_repo_db(conn, "repo", &args, dry_run).expect("db path");
    assert_eq!(
        scored, 1,
        "db path should return 1 scored row even in dry_run"
    );
    assert_eq!(rows.len(), 1);

    let persisted = conn
        .query_row("SELECT COUNT(*) FROM fact_commit_effort", [], |row| {
            row.get::<_, i64>(0)
        })
        .expect("count");
    assert_eq!(persisted, 0, "dry_run must not write to fact_commit_effort");
}
/// Two commits start with `ticketed = 1`: one mentions only a bare `#42`,
/// the other carries `ENG-7`. After `backfill_ticketed` the bare-hash commit
/// must be reset to 0 while the `ENG-7` commit stays at 1.
#[test]
fn backfill_ticketed_corrects_bare_hash_rows() {
    let mut db = Database::open_in_memory().expect("open");

    let preflagged = [
        (
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository, ticketed) VALUES ('bare1', 'n', 'e', '2024-01-01T00:00:00Z', \
             'some note about #42', 'repo', 1)",
            "insert bare-hash commit",
        ),
        (
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository, ticketed) VALUES ('jira1', 'n', 'e', '2024-01-02T00:00:00Z', \
             'ENG-7: add feature', 'repo', 1)",
            "insert JIRA commit",
        ),
    ];
    for (sql, failure) in preflagged {
        db.connection().execute(sql, []).expect(failure);
    }
    seed(&db, "plain1", "no ticket here");

    backfill_ticketed(&mut db, false, &[], None, None).expect("backfill ticketed");

    let conn = db.connection();
    let checks = [
        (
            "SELECT ticketed FROM commits WHERE sha = 'bare1'",
            "read bare",
            0_i64,
            "bare #N must be unticketed after backfill",
        ),
        (
            "SELECT ticketed FROM commits WHERE sha = 'jira1'",
            "read jira",
            1_i64,
            "JIRA ref must remain ticketed",
        ),
    ];
    for (sql, failure, expected, why) in checks {
        let ticketed = conn
            .query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(failure);
        assert_eq!(ticketed, expected, "{why}");
    }
}
/// A commit with a `Co-Authored-By: Claude ...` trailer must end up with
/// `is_ai_assisted = 1` and `ai_tool = 'claude'`; a commit without such a
/// trailer must end up with `is_ai_assisted = 0` and a NULL `ai_tool`.
#[test]
fn backfill_ai_detection_commits_detects_claude() {
    let mut db = Database::open_in_memory().expect("open");

    let ai_msg = "feat: add auth\n\nCo-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>";
    {
        let conn = db.connection();
        conn.execute(
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository) VALUES ('ai1', 'n', 'e', '2024-01-01T00:00:00Z', ?1, 'repo')",
            params![ai_msg],
        )
        .expect("insert AI commit");
    }
    seed(&db, "human1", "fix: bug without AI help");

    backfill_ai_detection_commits(&mut db, false, &[], None, None)
        .expect("backfill ai-detection");

    let conn = db.connection();
    let ai_row: (i64, Option<String>) = conn
        .query_row(
            "SELECT is_ai_assisted, ai_tool FROM commits WHERE sha = 'ai1'",
            [],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .expect("read ai1");
    let (is_ai, tool) = ai_row;
    assert_eq!(is_ai, 1, "AI-assisted commit must have is_ai_assisted=1");
    assert_eq!(tool, Some("claude".to_string()), "ai_tool must be 'claude'");

    let human_row: (i64, Option<String>) = conn
        .query_row(
            "SELECT is_ai_assisted, ai_tool FROM commits WHERE sha = 'human1'",
            [],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .expect("read human1");
    let (human_ai, human_tool) = human_row;
    assert_eq!(human_ai, 0, "human commit must have is_ai_assisted=0");
    assert!(human_tool.is_none(), "human commit must have ai_tool=NULL");
}
/// A classification whose subcategory is `bugfix` must have
/// `top_level_category = 'bugfix'` after `backfill_top_level` runs.
#[test]
fn backfill_top_level_fills_known_subcategories() {
    let mut db = Database::open_in_memory().expect("open");
    {
        let conn = db.connection();
        conn.execute(
            "INSERT INTO classifications (category, subcategory, confidence, method) \
             VALUES ('bugfix', 'bugfix', 0.9, 'exact_rule')",
            [],
        )
        .expect("insert classification");
    }

    backfill_top_level(&mut db, false).expect("backfill top-level");

    let resolved = db
        .connection()
        .query_row(
            "SELECT top_level_category FROM classifications WHERE subcategory = 'bugfix' \
             ORDER BY id DESC LIMIT 1",
            [],
            |row| row.get::<_, Option<String>>(0),
        )
        .expect("read top");
    assert_eq!(
        resolved,
        Some("bugfix".to_string()),
        "bugfix subcategory must resolve to 'bugfix' top-level"
    );
}
/// An effort row inserted without an `effort_tshirt` value and with size
/// `L` must read back `effort_tshirt = 4` after `backfill_effort_tshirt`.
#[test]
fn backfill_effort_tshirt_fills_from_size() {
    let mut db = Database::open_in_memory().expect("open");
    {
        // The column list deliberately omits `effort_tshirt`.
        let conn = db.connection();
        conn.execute(
            "INSERT INTO fact_commit_effort \
             (sha, repository, size, score, loc, files, test_loc, tests_factor, \
             formula_version, computed_at) \
             VALUES ('tshirt_test', 'repo', 'L', 15.5, 200, 5, 0, 1.0, 'v1', 1000000)",
            [],
        )
        .expect("insert effort row without tshirt");
    }

    backfill_effort_tshirt(&mut db, false).expect("backfill effort-tshirt");

    let band = db
        .connection()
        .query_row(
            "SELECT effort_tshirt FROM fact_commit_effort WHERE sha = 'tshirt_test'",
            [],
            |row| row.get::<_, Option<i64>>(0),
        )
        .expect("read effort_tshirt");
    assert_eq!(band, Some(4), "L size must map to effort_tshirt=4");
}
/// Running `backfill_quality` twice over the same two commits (one ticketed
/// feature, one revert, same author) must leave exactly one
/// `fact_weekly_quality` row for that author, with a quality score of
/// about 0.70.
#[test]
fn backfill_quality_populates_and_is_idempotent() {
    const COUNT_ALICE: &str =
        "SELECT COUNT(*) FROM fact_weekly_quality WHERE author_email = 'alice@example.com'";
    // Counts Alice's quality rows; `failure` is the panic message on error.
    let alice_rows = |db: &Database, failure: &str| -> i64 {
        db.connection()
            .query_row(COUNT_ALICE, [], |row| row.get(0))
            .expect(failure)
    };

    let mut db = Database::open_in_memory().expect("open");
    let fixtures = [
        (
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository, files_changed, insertions, deletions, is_merge, ticketed) \
             VALUES ('q1', 'Alice', 'alice@example.com', '2024-01-15T10:00:00+00:00', \
             'ENG-1 feature', 'repo-a', 1, 5, 1, 0, 1)",
            "seed ticketed feature",
        ),
        (
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository, files_changed, insertions, deletions, is_merge, ticketed) \
             VALUES ('q2', 'Alice', 'alice@example.com', '2024-01-16T10:00:00+00:00', \
             'Revert \"ENG-1 feature\"', 'repo-a', 1, 2, 5, 0, 0)",
            "seed revert",
        ),
    ];
    for (sql, failure) in fixtures {
        db.connection().execute(sql, []).expect(failure);
    }

    backfill_quality(&mut db, false).expect("first backfill");
    let after_first = alice_rows(&db, "count after first backfill");
    assert_eq!(after_first, 1, "first backfill must write exactly 1 row");

    backfill_quality(&mut db, false).expect("second backfill (idempotency)");
    let after_second = alice_rows(&db, "count after second backfill");
    assert_eq!(
        after_second, 1,
        "second backfill must not add duplicate rows (UPSERT semantics)"
    );

    let score = db
        .connection()
        .query_row(
            "SELECT quality_score FROM fact_weekly_quality WHERE author_email = 'alice@example.com'",
            [],
            |row| row.get::<_, f64>(0),
        )
        .expect("read quality_score");
    assert!(
        (score - 0.700).abs() < 0.001,
        "quality_score must be ~0.70 for this fixture, got {score:.6}"
    );
}
/// A dry-run quality backfill must succeed and leave `fact_weekly_quality`
/// empty.
#[test]
fn backfill_quality_dry_run_does_not_write() {
    let mut db = Database::open_in_memory().expect("open");
    {
        let conn = db.connection();
        conn.execute(
            "INSERT INTO commits (sha, author_name, author_email, timestamp, message, \
             repository, files_changed, insertions, deletions, is_merge) \
             VALUES ('dq1', 'Bob', 'bob@example.com', '2024-02-05T10:00:00+00:00', \
             'feat: x', 'repo-b', 1, 3, 1, 0)",
            [],
        )
        .expect("seed commit");
    }

    let dry_run = true;
    backfill_quality(&mut db, dry_run).expect("dry-run quality backfill must not error");

    let written = db
        .connection()
        .query_row("SELECT COUNT(*) FROM fact_weekly_quality", [], |row| {
            row.get::<_, i64>(0)
        })
        .expect("count after dry-run");
    assert_eq!(
        written, 0,
        "dry-run must not write any rows to fact_weekly_quality"
    );
}
/// Inserts (or replaces) an effort row with the given SHA, repository, score
/// and size.
///
/// All other columns are fixed placeholders; `effort_tshirt` is written as
/// `0` so the tests can observe the backfill overwriting it. Panics if the
/// insert fails.
fn seed_effort_row_for_tshirt(db: &Database, sha: &str, repo: &str, score: f64, size: &str) {
    const UPSERT_EFFORT: &str = "INSERT OR REPLACE INTO fact_commit_effort \
         (sha, repository, size, score, loc, files, test_loc, tests_factor, \
         formula_version, computed_at, effort_tshirt) \
         VALUES (?1, ?2, ?3, ?4, 10, 1, 0, 1.0, 'v1', 0, 0)";
    let conn = db.connection();
    conn.execute(UPSERT_EFFORT, params![sha, repo, size, score])
        .expect("insert effort row");
}
/// With ten rows scored 1..=10, the backfill must put score 1 in band 1 and
/// score 10 in band 5, and store thresholds with p20 = 2.0 and p80 = 8.0.
#[test]
fn backfill_effort_tshirt_uses_percentile_binning() {
    let mut db = Database::open_in_memory().expect("open");
    for rank in 1..=10u32 {
        let sha = format!("pct{rank:03}");
        seed_effort_row_for_tshirt(&db, &sha, "repo", f64::from(rank), "M");
    }

    backfill_effort_tshirt(&mut db, false).expect("backfill");

    let conn = db.connection();
    let band_checks = [
        (
            "SELECT effort_tshirt FROM fact_commit_effort WHERE sha = 'pct001'",
            "band for score=1",
            1_i64,
            "score=1 (below p20=2) → band 1",
        ),
        (
            "SELECT effort_tshirt FROM fact_commit_effort WHERE sha = 'pct010'",
            "band for score=10",
            5_i64,
            "score=10 (above p80=8) → band 5",
        ),
    ];
    for (sql, failure, expected, why) in band_checks {
        let band = conn
            .query_row(sql, [], |row| row.get::<_, i64>(0))
            .expect(failure);
        assert_eq!(band, expected, "{why}");
    }

    let loaded = tga::core::effort_percentile::load_thresholds(conn).expect("load thresholds");
    let stored = loaded.expect("must be Some after backfill of 10 rows");
    assert!((stored.p20 - 2.0).abs() < 1e-9, "stored p20 must be 2.0");
    assert!((stored.p80 - 8.0).abs() < 1e-9, "stored p80 must be 8.0");
}
/// With only three effort rows (all size `L`), every row must get
/// `effort_tshirt = 4` and no percentile thresholds may be stored.
#[test]
fn backfill_effort_tshirt_tiny_corpus_fallback() {
    let mut db = Database::open_in_memory().expect("open");
    for rank in 1..=3u32 {
        let sha = format!("tiny{rank}");
        seed_effort_row_for_tshirt(&db, &sha, "repo", f64::from(rank), "L");
    }

    backfill_effort_tshirt(&mut db, false).expect("backfill tiny corpus");

    let mut tshirts: Vec<i64> = Vec::new();
    {
        // Scoped so the prepared statement is dropped before the next query.
        let conn = db.connection();
        let mut stmt = conn
            .prepare("SELECT effort_tshirt FROM fact_commit_effort")
            .expect("prepare");
        let bands = stmt
            .query_map([], |row| row.get::<_, i64>(0))
            .expect("query");
        for band in bands {
            tshirts.push(band.expect("row"));
        }
    }
    assert!(
        tshirts.iter().all(|&v| v == 4),
        "all rows must get L=4 (static fallback), got {tshirts:?}"
    );

    let stored = tga::core::effort_percentile::load_thresholds(db.connection()).expect("load");
    assert!(
        stored.is_none(),
        "no thresholds should be stored for tiny corpus"
    );
}
}