1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
//! Shared argument types and data structures for `tga backfill`.
//!
//! Why: clap arg structs and the `EffortRow` internal record type are used
//! across multiple submodules; centralising them avoids cyclic imports and keeps
//! each submodule focused on logic rather than type declarations.
use clap::{Args, Subcommand};
/// Arguments for `tga backfill`.
// Maintainer note: clap's derive macros turn the `///` doc comments and the
// string literals in this block into `--help` output, so they are runtime text
// and are kept verbatim. Notes meant only for maintainers use `//`.
//
// The multi-line literals rely on the `\`-newline escape, which drops the line
// break and any leading whitespace on the following line; the indentation of
// the continuation lines therefore never reaches the rendered help.
#[derive(Args, Debug)]
#[command(
    about = "Retroactive maintenance operations on existing commit rows.",
    long_about = "Re-run extraction or scoring steps on commits already in the database.\n\n\
        These operations update existing rows in-place rather than ingesting new data.\n\
        Each subcommand supports --dry-run to preview changes without writing.\n\n\
        NOTE: --branch is collect-only. Commits in the DB do not carry branch\n\
        attribution after the walk, so there is no branch filter on backfill operations.\n\
        If you need to re-walk specific branches, use `tga collect --branch <name>`.\n\n\
        TIPS:\n\
        - Use --repos to limit scope to one service at a time on large corpora.\n\
        - Use --since/--until or --weeks to limit the date window for fast iteration.",
    after_help = "EXAMPLES:\n\
        # Re-extract ticket IDs for all commits (after pattern change)\n\
        tga backfill ticket-ids\n\n\
        # Re-score effort for the last 4 weeks of one repo\n\
        tga backfill effort --repos my-service --weeks 4 --force\n\n\
        # Re-run reachability scan after adding release-branch patterns\n\
        tga backfill reachability --repos core-api"
)]
pub struct BackfillArgs {
    /// Backfill subcommand.
    #[command(subcommand)]
    pub subcommand: BackfillSubcommand,

    // Every flag below is declared `global = true`, so it is shared by all
    // subcommands rather than being redeclared on each one.
    /// Report what would change without writing.
    #[arg(long, default_value_t = false, global = true)]
    pub dry_run: bool,

    /// Limit backfill to these repository names (comma-separated). [global]
    ///
    /// Matches against the `repository` column in the `commits` table
    /// (for ticket-ids, revert-flags) or the repo `name` in config
    /// (for reachability, effort). When omitted, all repos are processed.
    ///
    /// NOTE: not applicable to ai-detection (global LLM re-classification).
    // `value_delimiter = ','` splits a single `--repos a,b` value into entries.
    #[arg(long, value_delimiter = ',', global = true)]
    pub repos: Vec<String>,

    /// Limit backfill to commits in the last N ISO weeks. [global]
    ///
    /// Restricts the set of commits processed by timestamp. Mutually exclusive
    /// with --since/--until. Not applicable to reachability (uses config repos).
    // The exclusion is declared from both sides: here against `since`/`until`,
    // and on each of those fields against `weeks`.
    #[arg(
        long,
        value_name = "N",
        global = true,
        conflicts_with_all = ["since", "until"]
    )]
    pub weeks: Option<u32>,

    /// Limit backfill to commits on or after this date (ISO8601: YYYY-MM-DD). [global]
    ///
    /// Lower bound on the author timestamp. Mutually exclusive with --weeks.
    // Held as the raw string; no date parsing or validation happens in this
    // block (presumably done by the consumer — confirm).
    #[arg(
        long,
        value_name = "DATE",
        global = true,
        conflicts_with = "weeks"
    )]
    pub since: Option<String>,

    /// Limit backfill to commits on or before this date (ISO8601: YYYY-MM-DD). [global]
    ///
    /// Upper bound on the author timestamp. Mutually exclusive with --weeks.
    // Held as the raw string, same as `since`.
    #[arg(
        long,
        value_name = "DATE",
        global = true,
        conflicts_with = "weeks"
    )]
    pub until: Option<String>,
}
/// `tga backfill` subcommands.
// Maintainer note: the `///` comment on each variant is consumed by clap's
// `Subcommand` derive as that subcommand's help text, so the wording below is
// runtime output and is kept verbatim. Maintainer-only remarks use `//`.
//
// Variants are invoked by their kebab-case names, as the help text in this file
// already shows (`tga backfill ticket-ids`, "ai-detection").
#[derive(Subcommand, Debug)]
pub enum BackfillSubcommand {
    /// Re-run LLM classification on low-confidence prior LLM verdicts.
    ///
    /// Clears `classification_id` on commits classified by the LLM tier
    /// with confidence < 0.7, making them eligible for re-classification
    /// on the next `tga classify` run. Use `tga classify --force` after
    /// this to immediately re-process the cleared commits.
    // Not to be confused with `AiDetectionCommits` further down, which scans
    // commit messages for co-authorship trailers and needs no LLM.
    AiDetection,

    /// Scan commit messages for revert patterns and update `is_revert`.
    ///
    /// Detects `Revert "..."`, `revert:`, and `revert"` prefixes
    /// (case-insensitive). Use --repos/--since/--until to limit scope.
    RevertFlags,

    /// Scan commit messages for ticket references and update `ticket_id`/`ticketed`.
    ///
    /// Useful after extending ticket patterns or when collecting a new
    /// repo whose commits were never run through ticket extraction.
    /// --branch is collect-only and not applicable here.
    TicketIds,

    /// Re-run the tag/branch/default-branch reachability scan.
    ///
    /// Upserts `fact_commit_reachability` rows without re-collecting commits.
    /// Use this to fix `on_default_branch=0` rows in existing databases
    /// without running the full 20-minute `tga collect` pipeline (issue #290).
    ///
    /// Use --repos (via BackfillArgs) to limit to specific repositories.
    /// --branch is collect-only; reachability is computed from the live git
    /// repo graph, not from the branch the commits were originally collected on.
    Reachability,

    /// Compute empirical effort scores for historical commits.
    ///
    /// Persists scores in `fact_commit_effort` using the v1 formula
    /// (LoC + file count + tests factor, mapped to XS/S/M/L/XL).
    ///
    /// Default path (db-only): reads from `commits JOIN files` — no on-disk
    /// git repo required. Use --range or --notes to switch to the git path.
    ///
    /// --branch is collect-only and not applicable here.
    // Carries its own flags; see `EffortBackfillArgs`.
    Effort(EffortBackfillArgs),

    /// Fill in missing `complexity` scores (1–5) for already-classified commits.
    ///
    /// The `complexity` column is only ever populated by the LLM tier, which
    /// the normal `tga classify` run consults solely for low-confidence
    /// commits. Commits resolved by rules or external sources (JIRA/GitHub)
    /// therefore keep `complexity = NULL`. This subcommand asks the LLM for a
    /// 1–5 complexity score for every classification with `complexity IS NULL`
    /// and a non-`exact_rule` method, leaving category/confidence/method
    /// untouched. Requires `use_llm: true` (or `--use-llm`) and an LLM API key.
    ///
    /// Equivalent to `tga classify --backfill-complexity`; exposed here so the
    /// operation is discoverable under `tga backfill` (issue #397, bug 2).
    /// --repos/--since/--until/--weeks do not scope this operation: all NULL
    /// rows are processed.
    // Carries its own flags; see `ComplexityBackfillArgs`.
    Complexity(ComplexityBackfillArgs),

    /// Recompute `commits.ticketed` using the fixed regex rules (issue #445).
    ///
    /// Bare `#N` refs no longer mark a commit as ticketed; only JIRA/Linear
    /// (`PROJ-N`), GitHub action keywords (`closes/fixes/resolves #N`), and
    /// Azure DevOps (`AB#N`) do. This subcommand re-evaluates every stored
    /// `commits.message` with the corrected [`is_ticketed`] and updates rows
    /// that differ from the stored value. No LLM required — pure regex.
    ///
    /// Use --repos/--since/--until to limit scope on large databases.
    // NOTE(review): `is_ticketed` is not imported in the visible part of this
    // file, so the intra-doc link above may not resolve in rustdoc, and the
    // brackets presumably appear literally in `--help` — confirm.
    Ticketed,

    /// Scan existing `commits.message` for AI co-authorship trailers (issue #445).
    ///
    /// Detects `Co-Authored-By:` trailers for Claude, GitHub Copilot, and
    /// Cursor; sets `commits.is_ai_assisted` and `commits.ai_tool`.
    /// No LLM required — pure string matching.
    ///
    /// Use --repos/--since/--until to limit scope.
    AiDetectionCommits,

    /// Fill in `classifications.top_level_category` from existing subcategory
    /// values using the built-in taxonomy (issue #445).
    ///
    /// The top_level_category column was added in migration v17 and is
    /// populated for new classifications at write time. This subcommand
    /// retroactively fills existing rows by resolving each subcategory through
    /// the taxonomy registry. No LLM required.
    TopLevel,

    /// Fill in `fact_commit_effort.effort_tshirt` from existing `size` values
    /// (issue #445).
    ///
    /// Maps the text size label (XS/S/M/L/XL) to the numeric T-shirt integer
    /// (1–5) for existing rows that pre-date migration v17.
    EffortTshirt,

    /// Recompute and persist per-engineer-per-week quality scores to
    /// `fact_weekly_quality` for all historical data (issue #445 batch B).
    ///
    /// Reads the full `commits` table (left-joined against `classifications`
    /// and `authors`), re-runs the aggregator's weekly bucketing logic, and
    /// UPSERTs every (author, week, repo) grain into `fact_weekly_quality`.
    /// Idempotent — running twice produces the same result. Supports --dry-run
    /// to report the row count without writing.
    ///
    /// No LLM required — quality scoring is a pure formula applied to
    /// per-bucket counts of reverts, bugfixes, and ticketed commits.
    Quality,
}
/// Arguments for `tga backfill complexity`.
// Maintainer note: the `///` text on the field is rendered by clap as the
// flag's help, so it is kept verbatim; maintainer-only remarks use `//`.
#[derive(Args, Debug)]
pub struct ComplexityBackfillArgs {
    /// Enable the LLM tier for this run even if `config.classification.use_llm`
    /// is `false`.
    ///
    /// Complexity scoring is LLM-only, so the LLM tier must be on. Pass this
    /// flag (or set `use_llm: true` in config) along with an API key
    /// (`OPENAI_API_KEY` / `OPENROUTER_API_KEY`).
    // Opt-in switch only: it can turn the LLM tier on for one run; nothing in
    // this struct turns it off.
    #[arg(
        long,
        default_value_t = false
    )]
    pub use_llm: bool,
}
/// Arguments for `tga backfill effort`.
// Maintainer note: the `///` text on each field is rendered by clap as that
// flag's help, so it is kept verbatim; maintainer-only remarks use `//`.
//
// Per the help text below, `--range` and `--notes` are the two flags that need
// an on-disk git repository; `--force` and `--limit` do not mention one.
#[derive(Args, Debug)]
pub struct EffortBackfillArgs {
    /// Scope effort computation to a git commit range (e.g. `HEAD~10..HEAD`).
    ///
    /// When omitted, all commits in the chosen repo(s) that do not already
    /// have a `fact_commit_effort` row are processed (unless `--force`).
    /// Requires a live on-disk git repository.
    // Kept as the raw range string; nothing in this block parses it.
    #[arg(
        long,
        value_name = "RANGE"
    )]
    pub range: Option<String>,

    /// Recompute effort even if a row already exists (UPSERT semantics).
    ///
    /// Without this flag, commits that already have a row in
    /// `fact_commit_effort` are skipped. With `--force`, every commit is
    /// re-scored and the existing row is replaced.
    #[arg(
        long,
        default_value_t = false
    )]
    pub force: bool,

    /// Also write a git note to `refs/notes/effort` for each scored commit.
    ///
    /// The note body is `Effort: <size>` (e.g. `Effort: M`), matching the
    /// format the pre-commit hook injects into commit messages. Off by
    /// default to keep the backfill lightweight. Requires a live git repo.
    #[arg(
        long,
        default_value_t = false
    )]
    pub notes: bool,

    /// Maximum commits to process per repository.
    ///
    /// Useful for smoke-testing on a large corpus. When omitted, all
    /// eligible commits are processed.
    // `None` means "no cap"; the cap applies per repository, not per run.
    #[arg(
        long,
        value_name = "N"
    )]
    pub limit: Option<usize>,
}
/// One pending `fact_commit_effort` row, built in memory before it is persisted.
///
/// Why: the db-backed and git-backed effort paths both emit this same shape, so
/// `persist_effort_rows` can batch-insert their output regardless of which path
/// produced it.
/// What: carries every column of a single `fact_commit_effort` row.
/// Test: constructed in `effort.rs` tests via `backfill_effort_persists_rows`.
// `Debug`, `Clone` and `PartialEq` are derived so rows can be logged, copied,
// and compared with `assert_eq!` in tests. `Eq` is deliberately absent: the
// `f64` fields do not implement it.
#[derive(Debug, Clone, PartialEq)]
pub(super) struct EffortRow {
    /// Identifier of the scored commit (presumably the full SHA — confirm).
    pub sha: String,
    /// Name of the repository the commit was scored in.
    pub repository: String,
    /// Text size label: one of XS/S/M/L/XL.
    pub size: String,
    /// Raw effort score the size label was derived from.
    pub score: f64,
    /// Lines-of-code input to the effort formula.
    pub loc: u32,
    /// File-count input to the effort formula.
    pub files: u32,
    /// Test lines of code (presumably the portion of `loc` in test files — confirm).
    pub test_loc: u32,
    /// Tests-factor input to the effort formula.
    pub tests_factor: f64,
    /// Version tag of the formula that produced `score` (e.g. the v1 formula).
    pub formula_version: String,
    /// When the score was computed (presumably a Unix timestamp — confirm unit).
    pub computed_at: i64,
    /// Numeric T-shirt size (static label mapping): XS=1, S=2, M=3, L=4, XL=5.
    ///
    /// Note: `persist_effort_rows` recomputes this from stored percentile thresholds
    /// when available. This field is used as a fallback when no thresholds are stored.
    // NOTE(review): the doc above says this value is read as a fallback, yet
    // the lint suppression implies the compiler sees no reader — confirm which
    // is true and drop the `allow` if the field is in fact read.
    #[allow(dead_code)]
    pub effort_tshirt: i64,
}