// tga 2.0.0
// Developer productivity analytics — git commit collection, classification, and reporting
//! End-to-end Stage 1 collection pipeline.
//!
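//! Orchestrates git extraction, reachability scanning, identity resolution,
//! and the optional pull-request (GitHub, Bitbucket, Azure DevOps) and issue
//! (Azure DevOps work items, Linear) fetches against a configured
//! [`crate::core::config::Config`].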

use chrono::{DateTime, NaiveDate, TimeZone, Utc};
use tracing::{info, warn};

use crate::collect::azdo::AzureDevOpsClient;
use crate::collect::bitbucket::BitbucketClient;
use crate::collect::errors::Result;
use crate::collect::git::GitCollector;
use crate::collect::github::GitHubClient;
use crate::collect::identity::IdentityResolver;
use crate::collect::linear::LinearClient;
use crate::collect::pr_provider::PrProvider;
use crate::collect::weeks::{clamp_week_to_range, weeks_in_range};
use crate::core::config::Config;
use crate::core::db::{self, Database};
use crate::core::models::PullRequest;

/// Outcome of a `git fetch origin` attempt for a single repository.
///
/// Why: fetch errors are invisible unless the user reads tracing logs;
/// surfacing them in `CollectionStats` lets the CLI print an actionable
/// end-of-run summary.
/// What: three variants — `Success` (the remote was fetched), `Failed` (the
/// fetch was attempted and the error is kept as a string), and `Skipped` (no
/// fetch was attempted; the reason is kept as a string).
/// Test: covered by `commands::collect` integration tests.
#[derive(Debug, Clone)]
pub enum FetchOutcome {
    /// The fetch was attempted and completed successfully.
    Success {
        /// Name of the remote that was fetched (usually `"origin"`).
        remote: String,
    },
    /// The fetch was attempted but did not succeed.
    Failed {
        /// Name of the remote that was tried.
        remote: String,
        /// Human-readable description of the failure, stored as plain text
        /// rather than as the original error value.
        error: String,
    },
    /// No fetch was attempted for this repository.
    Skipped {
        /// Reason the fetch was skipped (e.g. `"--no-fetch"` or `"no remote"`).
        reason: String,
    },
}

/// Per-repository fetch result, collected into [`CollectionStats::fetch_outcomes`].
///
/// Why: pairs the display name of a repo with its fetch outcome so the
/// end-of-run summary can be printed without re-querying the git repo.
/// What: plain data carrier with public fields and no behaviour of its own.
/// Test: covered by collection pipeline integration tests.
#[derive(Debug, Clone)]
pub struct PerRepoFetch {
    /// Display name of the repository (from config `name` or dir basename).
    pub repo: String,
    /// What happened when the fetch for this repo was attempted (or skipped).
    pub outcome: FetchOutcome,
}

/// Aggregate statistics for a single pipeline run.
///
/// Why: callers (CLI, integration tests) need a single typed object
/// describing what the run did, both for stdout output and for asserting
/// expectations in tests.
/// What: counter struct populated by [`CollectionPipeline::run`]; every
/// counter starts at zero via `Default`, and the `errors` vec accumulates the
/// non-fatal errors hit along the way.
/// Test: covered by `tests::collect_integration_repo` (integration test
/// that runs the pipeline against a fixture repo).
#[derive(Debug, Clone, Default)]
pub struct CollectionStats {
    /// Number of new commit rows written across all repositories.
    pub commits_collected: usize,
    /// Number of distinct authors observed and upserted.
    pub authors_resolved: usize,
    /// Number of PR rows written. Not GitHub-specific: the Azure DevOps PR
    /// fetch in [`CollectionPipeline::run`] adds its stored count here too.
    /// Stays zero when no PR fetch is enabled.
    pub prs_fetched: usize,
    /// Number of Linear issues reported as stored by the Linear persistence
    /// step (0 if Linear is not configured).
    pub linear_issues_fetched: usize,
    /// Number of `(repo, week)` pairs that were collected this run.
    pub weeks_collected: usize,
    /// Number of `(repo, week)` pairs skipped because already present in
    /// `collection_runs` (and `force` was false).
    pub weeks_skipped: usize,
    /// Non-fatal error messages accumulated during the run. Besides per-repo
    /// failures (repo could not be opened, reachability scan failed) this also
    /// receives provider-level failures such as GitHub/Bitbucket client
    /// initialisation, Azure DevOps persistence, and Linear query/store errors.
    pub errors: Vec<String>,
    /// Total `fact_commit_reachability` rows upserted across all repos.
    pub reachability_rows: usize,
    /// Per-repo fetch outcomes.
    ///
    /// Populated in the per-repo loop of [`CollectionPipeline::run`], one
    /// entry per repository whose collector could be opened; a repository
    /// that fails to open is reported through `errors` and gets no entry
    /// here. Used by the CLI to print the end-of-collect fetch summary.
    pub fetch_outcomes: Vec<PerRepoFetch>,
}

/// Top-level Stage 1 orchestrator.
///
/// Why: callers should not need to know the order of git extraction,
/// reachability scanning, identity resolution, pull-request fetches, and
/// Azure DevOps / Linear enrichment — the pipeline owns the orchestration.
/// What: holds a validated [`Config`] plus the toggles that shape a run:
/// forced re-collection, offline runs (`no_fetch`), PR re-fetch
/// (`force_refresh_prs`), reachability and revwalk scoping, and the
/// fetch-summary flags. Constructed via [`Self::new`] + builder methods.
/// Test: covered by `tests::pipeline_constructs_with_default_config` and
/// the integration test `tests/integration_test.rs`.
pub struct CollectionPipeline {
    /// Validated configuration the whole run is driven from.
    config: Config,
    /// When `true`, re-collect every `(repo, ISO-week)` pair even if
    /// `collection_runs` already has a row for it.
    force: bool,
    /// When `true`, skip the pre-walk `git fetch origin` on each repository.
    no_fetch: bool,
    /// When `true`, re-fetch Azure DevOps pull requests even when their IDs
    /// are already present in `pull_requests`.
    force_refresh_prs: bool,
    /// When `true`, skip the tag and release-branch reachability scan
    /// (i.e. do not populate `fact_commit_reachability` with tag/branch data).
    skip_tag_reachability: bool,
    /// When `true`, seed every repository's revwalk from HEAD only (legacy
    /// 1.x behaviour).  When `false` (default since 2.0.0), all local branch
    /// heads and `refs/remotes/origin/*` refs are pushed so commits on
    /// non-default branches are not silently excluded.
    ///
    /// A per-repo `head_only: true` in `RepositoryConfig` provides the same
    /// opt-out for a single repository while keeping all-branch coverage for
    /// the rest.  The global flag here is OR-ed with the per-repo flag — if
    /// either is `true`, that repo walks HEAD only.
    head_only: bool,
    /// Explicit branch list for the `--branch` CLI filter.
    ///
    /// When non-empty, the revwalk is seeded from only these branch names
    /// (both `refs/heads/<name>` and `refs/remotes/origin/<name>` for each).
    /// Mutually exclusive with `head_only` — the CLI enforces this via
    /// `conflicts_with`.  An empty Vec means "no restriction" (the default).
    branches: Vec<String>,
    /// When `true`, the CLI exits non-zero after the collect summary if any
    /// repo had a fetch failure. Default `false` — failures are visible but
    /// non-fatal. The pipeline only stores the flag; see [`Self::strict_fetch`].
    strict_fetch: bool,
    /// When `true`, print a success line for every fetched repo in the summary
    /// (not just failures). Default `false` — only failures are printed. The
    /// pipeline only stores the flag; see [`Self::verbose_fetch`].
    verbose_fetch: bool,
}

impl CollectionPipeline {
    /// Construct a new pipeline from a validated [`Config`].
    ///
    /// Why: pipelines start with every toggle disabled; callers opt in to
    /// forced re-collection, offline runs, PR refresh, and the other switches
    /// via the builder methods.
    /// What: stores the config, sets all boolean toggles (`force`,
    /// `no_fetch`, `force_refresh_prs`, `skip_tag_reachability`, `head_only`,
    /// `strict_fetch`, `verbose_fetch`) to `false`, and leaves `branches`
    /// empty, which means "no branch restriction".
    /// Test: covered by `tests::pipeline_constructs_with_default_config`.
    #[must_use]
    pub fn new(config: Config) -> Self {
        Self {
            config,
            force: false,
            no_fetch: false,
            force_refresh_prs: false,
            skip_tag_reachability: false,
            head_only: false,
            branches: Vec::new(),
            strict_fetch: false,
            verbose_fetch: false,
        }
    }

    /// Enable or disable forced re-collection.
    ///
    /// With `force = true`, every `(repo, ISO-week)` pair is collected
    /// regardless of whether `collection_runs` already has a row for it.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_force(mut self, force: bool) -> Self {
        self.force = force;
        self
    }

    /// If `true`, skip the pre-walk `git fetch origin` on each repository.
    ///
    /// Default is `false` (i.e. always fetch). Useful for offline runs or
    /// when the caller has already fetched.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_no_fetch(mut self, no_fetch: bool) -> Self {
        self.no_fetch = no_fetch;
        self
    }

    /// If `true`, skip the post-collection tag and release-branch reachability
    /// scan.
    ///
    /// When disabled, `fact_commit_reachability` rows for `on_any_tag`,
    /// `reachable_from_tags`, `on_release_branch`, and `release_branches` are
    /// not populated. Useful for trunk-based repos where no tags or release
    /// branches are used, or to reduce collection time on large repos with
    /// thousands of tags.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_skip_tag_reachability(mut self, skip: bool) -> Self {
        self.skip_tag_reachability = skip;
        self
    }

    /// Enable or disable the global HEAD-only revwalk escape hatch.
    ///
    /// Why: tga 2.0.0 changed the default to walk all local branches and remote
    /// tracking refs. This method lets the CLI `--head-only` flag propagate to
    /// every per-repo collector in the pipeline.  Per-repo `head_only: true` in
    /// `RepositoryConfig` provides the same opt-out for individual repos.
    /// What: when `true`, every repo seeds its revwalk from HEAD only; when
    /// `false` (the default), each repo follows its own per-config `head_only`
    /// setting, because the two flags are OR-ed per repository.
    /// Test: see `tests::head_only_legacy_behavior` in extractor.rs.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_head_only(mut self, head_only: bool) -> Self {
        self.head_only = head_only;
        self
    }

    /// Restrict the revwalk to an explicit list of branch names.
    ///
    /// Why: the `--branch <NAME[,NAME…]>` CLI flag lets callers scope collection
    /// to specific branches without touching the YAML config.  This is the
    /// pipeline-level counterpart that threads the list down to each
    /// `GitCollector`.
    /// What: when `branches` is non-empty, each `GitCollector` seeds the
    /// revwalk from `refs/heads/<name>` + `refs/remotes/origin/<name>` for
    /// every listed name, emitting a warning for names not found in a given
    /// repo.  An empty `branches` (the default) means "no restriction".
    /// Mutually exclusive with `head_only` — enforced at the CLI layer via
    /// `conflicts_with`, not by this method.
    /// Test: see `tests::branch_filter_walks_only_named_branch` in
    /// `collect::git::extractor::tests`.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_branches(mut self, branches: Vec<String>) -> Self {
        self.branches = branches;
        self
    }

    /// Record whether fetch failures should be treated as fatal by the CLI.
    ///
    /// Why: fetch failures are non-fatal by default (collection continues on
    /// local refs); `--strict-fetch` lets CI pipelines treat stale data as an
    /// error.
    /// What: only stores the flag. The CLI checks
    /// [`CollectionStats::fetch_outcomes`] after `run()` and exits non-zero
    /// if any `Failed` variant is present and this flag is set.
    /// Test: the `commands::collect` handler reads this flag from `args`.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_strict_fetch(mut self, strict: bool) -> Self {
        self.strict_fetch = strict;
        self
    }

    /// When `true`, print a success line per fetched repo in the end-of-run
    /// summary (default: only failures are shown).
    ///
    /// Why: the default summary hides successful fetches to keep output brief;
    /// `--verbose-fetch` is useful when debugging network topology.
    /// What: only stores the flag; the CLI uses it when printing the fetch
    /// summary.
    /// Test: the `commands::collect` handler reads this flag from `args`.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_verbose_fetch(mut self, verbose: bool) -> Self {
        self.verbose_fetch = verbose;
        self
    }

    /// Returns whether `--strict-fetch` was set, i.e. the value last passed to
    /// [`Self::with_strict_fetch`] (`false` if it was never called).
    #[must_use]
    pub fn strict_fetch(&self) -> bool {
        self.strict_fetch
    }

    /// Returns whether `--verbose-fetch` was set, i.e. the value last passed
    /// to [`Self::with_verbose_fetch`] (`false` if it was never called).
    #[must_use]
    pub fn verbose_fetch(&self) -> bool {
        self.verbose_fetch
    }

    /// If `true`, re-fetch Azure DevOps pull requests even when their IDs are
    /// already present in `pull_requests`.
    ///
    /// This bypasses the [`crate::collect::azdo::get_existing_pr_numbers`]
    /// deduplication cache for the ADO provider, so stale rows persisted
    /// before v1.0.9 (with `commit_shas = '[]'`) are re-fetched and
    /// re-upserted with the correct merge SHA. Default is `false`.
    ///
    /// Consumes the pipeline and returns the updated value, so the result
    /// must be kept.
    #[must_use = "builder methods consume the pipeline and return the updated value"]
    pub fn with_force_refresh_prs(mut self, force_refresh_prs: bool) -> Self {
        self.force_refresh_prs = force_refresh_prs;
        self
    }

    /// Borrow the configuration this pipeline was constructed with.
    #[must_use]
    pub fn config(&self) -> &Config {
        &self.config
    }

    /// Run the full collection sequence against `db`.
    ///
    /// Steps, in order:
    /// 1. For each configured repository: fetch once, then collect week by
    ///    week.
    /// 2. Tag / release-branch reachability scan (unless skipped).
    /// 3. Author upsert from observed commits, followed by a best-effort
    ///    warning about commits whose author is still unresolved.
    /// 4. Pull-request fetch and store through the configured PR providers.
    /// 5. Optional Azure DevOps connection probe, work-item enrichment, and
    ///    PR fetch.
    /// 6. Optional Linear issue enrichment.
    ///
    /// Each repository is processed sequentially; per-repo failures are
    /// recorded in [`CollectionStats::errors`] but do not abort the run.
    ///
    /// # Errors
    ///
    /// Returns a non-recoverable [`crate::collect::CollectError`] only for
    /// failures outside the per-repo loop. In this body the only propagated
    /// error is the one from the author upsert step; every other failure is
    /// logged and/or pushed onto `stats.errors`.
    pub async fn run(&self, db: &mut Database) -> Result<CollectionStats> {
        let mut stats = CollectionStats::default();

        let resolver = IdentityResolver::from_config(&self.config);

        for repo_cfg in &self.config.repositories {
            // Per-repo head_only is OR-ed with the global pipeline flag: if
            // either is true, that repo walks HEAD only.  This lets operators
            // set `--head-only` globally (the CLI flag) or `head_only: true`
            // per repo in YAML without requiring both to be set.
            let effective_head_only = self.head_only || repo_cfg.head_only;
            // Build a pre-fetch collector (with no_fetch = self.no_fetch) solely
            // to run perform_fetch once and capture the outcome. Then build the
            // walk collector with no_fetch=true so the per-week collect_window
            // calls don't re-fetch.
            let pre_fetch_collector = match GitCollector::new(repo_cfg) {
                Ok(c) => c
                    .no_fetch(self.no_fetch)
                    .with_head_only(effective_head_only)
                    .with_explicit_branches(self.branches.clone()),
                Err(e) => {
                    // A repo that cannot be opened is skipped entirely: it is
                    // reported in `errors` and gets no `fetch_outcomes` entry.
                    let msg = format!("failed to open repo {}: {e}", repo_cfg.path.display());
                    warn!("{msg}");
                    stats.errors.push(msg);
                    continue;
                }
            };
            // Perform the one-shot fetch and record the outcome (#334).
            let fetch_result = pre_fetch_collector.perform_fetch();
            stats.fetch_outcomes.push(fetch_result);

            // Walk collector always has no_fetch=true: the fetch was either just
            // performed above, or was intentionally skipped (--no-fetch).
            let collector = match GitCollector::new(repo_cfg) {
                Ok(c) => c
                    .no_fetch(true)
                    .with_head_only(effective_head_only)
                    .with_explicit_branches(self.branches.clone()),
                Err(e) => {
                    // Second open of the same repo; on failure the fetch
                    // outcome recorded above is kept but no commits are walked.
                    let msg = format!("failed to open repo {}: {e}", repo_cfg.path.display());
                    warn!("{msg}");
                    stats.errors.push(msg);
                    continue;
                }
            };
            self.collect_repo_by_week(db, &collector, &mut stats);
        }

        // Tag and release-branch reachability scan (issue #279).
        // Run once after all per-repo git walks, before PR fetches, because the
        // reachability data is derived purely from the local git graph.
        if !self.skip_tag_reachability {
            self.run_reachability_scan(db, &mut stats);
        } else {
            info!("skipping tag/release-branch reachability scan (--skip-tag-reachability)");
        }

        // Backfill authors from observed commits.
        // This is the only step whose error aborts the run (via `?`).
        stats.authors_resolved = self.upsert_observed_authors(db, &resolver)?;

        // Issue #68: any commit with NULL author_id after identity resolution
        // is "phantom" — it would be counted as a distinct developer in
        // reports. Surface the count so the operator can extend the alias map.
        // Best-effort: if the count query itself fails, the error is ignored
        // and no warning is printed.
        if let Ok(unresolved) = count_unresolved_commits(db) {
            if unresolved > 0 {
                let msg = format!(
                    "WARNING: {unresolved} commits have unresolved author identities and may \
                     inflate developer counts. Run `tga aliases list` to review, or extend \
                     `developer_aliases` in the config to map missing identities."
                );
                // Emitted twice on purpose: once through tracing and once
                // directly on stderr.
                warn!("{msg}");
                eprintln!("{msg}");
            }
        }

        // PR providers (GitHub, Bitbucket, …) run concurrently. Each
        // provider fetches on its own task, then we persist the results
        // sequentially on the main task because `Database` is not `Sync`.
        self.fetch_and_store_prs(db, &mut stats).await;

        // Optional: Azure DevOps connection probe + work-item enrichment.
        if let Some(azdo_cfg) = self.config.azure_devops_config() {
            let client = AzureDevOpsClient::new(azdo_cfg.clone());
            // The probe is informational only: a failed connection test is
            // logged and the enrichment / PR steps below are still attempted.
            match client.test_connection().await {
                Ok(info) => info!(
                    user = info.user_name.as_deref().unwrap_or("?"),
                    org = %info.organization_url,
                    "Azure DevOps connection verified",
                ),
                Err(e) => {
                    warn!("Azure DevOps connection failed (non-fatal): {e}");
                }
            }
            if azdo_cfg.fetch_on_reference {
                if let Err(e) = self
                    .fetch_and_persist_azdo_work_items(db, &client, azdo_cfg)
                    .await
                {
                    stats
                        .errors
                        .push(format!("ADO work item persistence failed: {e}"));
                }
            }
            if azdo_cfg.fetch_prs {
                match self.fetch_and_persist_azdo_prs(db, azdo_cfg).await {
                    Ok(n) => {
                        info!(prs = n, "stored ADO pull requests");
                        // ADO PRs share the `prs_fetched` counter with the
                        // other PR providers.
                        stats.prs_fetched += n;
                    }
                    Err(e) => {
                        stats.errors.push(format!("ADO PR fetch failed: {e}"));
                    }
                }
            }
        }

        // Optional: Linear issue enrichment.
        if let Some(linear_cfg) = &self.config.linear {
            if linear_cfg.fetch_on_reference {
                match LinearClient::new(linear_cfg) {
                    Ok(client) => {
                        // Collect commit messages from DB.
                        // Scoped block: `conn`, `stmt` and `rows` are dropped
                        // at the closing brace, i.e. before the `.await` below.
                        // NOTE(review): presumably required so the connection
                        // borrow is not held across the await — confirm.
                        let messages: Vec<String> = {
                            let conn = db.connection();
                            let mut stmt = match conn.prepare("SELECT message FROM commits") {
                                Ok(s) => s,
                                Err(e) => {
                                    stats
                                        .errors
                                        .push(format!("Linear: query commits failed: {e}"));
                                    // Early return is only harmless because
                                    // Linear enrichment is the last step of
                                    // `run`; a step added after this block
                                    // would be skipped on this path.
                                    return Ok(stats);
                                }
                            };
                            let rows = match stmt.query_map([], |row| row.get::<_, String>(0)) {
                                Ok(r) => r,
                                Err(e) => {
                                    stats
                                        .errors
                                        .push(format!("Linear: read commits failed: {e}"));
                                    // Same caveat as the early return above.
                                    return Ok(stats);
                                }
                            };
                            let mut out = Vec::new();
                            // `flatten` silently drops rows whose `message`
                            // could not be read as a `String`; they are
                            // neither counted nor reported.
                            for r in rows.flatten() {
                                out.push(r);
                            }
                            out
                        };

                        let msg_refs: Vec<&str> = messages.iter().map(String::as_str).collect();
                        let issues = client
                            .fetch_referenced_issues(&msg_refs, &linear_cfg.team_keys)
                            .await;
                        for issue in &issues {
                            info!(
                                id = %issue.identifier,
                                state = %issue.state,
                                team = %issue.team,
                                "Linear issue fetched"
                            );
                        }
                        match client.store_issues(db, &issues) {
                            Ok(n) => {
                                info!(stored = n, "persisted linear_issues rows");
                                // Counts what the store step reported, not the
                                // number of issues returned by the fetch.
                                stats.linear_issues_fetched += n;
                            }
                            Err(e) => {
                                stats
                                    .errors
                                    .push(format!("Linear: store issues failed: {e}"));
                            }
                        }
                    }
                    Err(e) => {
                        stats.errors.push(format!("Linear client init failed: {e}"));
                    }
                }
            }
        }

        Ok(stats)
    }

    /// Scan tag and release-branch reachability for each configured
    /// repository, folding the outcome into `stats`.
    ///
    /// Why: once commits are stored, one pass over each repo's git graph is
    /// enough to write the `fact_commit_reachability` rows.  The scan is
    /// non-fatal — a failing repo only contributes a message to
    /// `stats.errors`, and the remaining repos are still scanned.
    /// What: returns immediately when both `track_tags` and
    /// `track_release_branches` are off.  Otherwise walks
    /// `self.config.repositories`, expands each path, derives a display name
    /// (config `name`, else directory basename, else the full path), calls
    /// [`crate::collect::git::reachability::scan_and_persist`], and adds
    /// `rows_upserted` to `stats.reachability_rows`.
    /// Test: covered by the integration test in `reachability::tests`.
    fn run_reachability_scan(&self, db: &mut Database, stats: &mut CollectionStats) {
        use crate::collect::git::reachability::scan_and_persist;
        use crate::core::config::expand_path;

        let reach_cfg = &self.config.reachability;

        // Nothing to do unless at least one kind of tracking is switched on.
        let tracking_enabled = reach_cfg.track_tags || reach_cfg.track_release_branches;
        if !tracking_enabled {
            info!("reachability tracking disabled by config (track_tags=false, track_release_branches=false)");
            return;
        }

        let conn = db.connection();
        for repo_cfg in &self.config.repositories {
            let repo_path = expand_path(&repo_cfg.path);

            // Display name: explicit config name wins, then the directory
            // basename, and finally the whole path as a last resort.
            let repo_name = match &repo_cfg.name {
                Some(configured) => configured.clone(),
                None => repo_path
                    .file_name()
                    .and_then(|base| base.to_str())
                    .map_or_else(|| repo_path.display().to_string(), str::to_owned),
            };

            info!(repo = %repo_name, "running reachability scan");
            let report = match scan_and_persist(&repo_path, conn, reach_cfg, Some(&repo_name)) {
                Ok(report) => report,
                Err(e) => {
                    // Record the failure and move on to the next repository.
                    let msg = format!("reachability scan failed for {repo_name}: {e}");
                    warn!("{msg}");
                    stats.errors.push(msg);
                    continue;
                }
            };

            info!(
                repo = %repo_name,
                rows = report.rows_upserted,
                default_branch = report.default_branch_commits,
                tagged = report.tagged_commits,
                release_branch = report.release_branch_commits,
                "reachability scan complete"
            );
            stats.reachability_rows += report.rows_upserted;
        }
    }

    /// Assemble the [`PrProvider`] instances enabled by the current
    /// configuration.
    ///
    /// Providers are constructed independently: when one fails to
    /// initialise, the failure is appended to `stats.errors` and the others
    /// are still built.  Configurations that would leave `pull_requests`
    /// empty (no token, `fetch_prs` off, no `github:` block) only produce
    /// diagnostics.
    fn build_pr_providers(
        &self,
        stats: &mut CollectionStats,
    ) -> Vec<Box<dyn PrProvider + Send + Sync>> {
        let mut enabled: Vec<Box<dyn PrProvider + Send + Sync>> = Vec::new();

        match &self.config.github {
            Some(gh_cfg) if gh_cfg.fetch_prs => {
                // Multi-repo resolution (#87): drive from `repositories[]` or
                // `github.org` when `github.repo` is not set. If nothing
                // resolves, skip GitHub PR fetching gracefully.
                let repos = crate::collect::github::client::resolve_github_repos(
                    gh_cfg,
                    &self.config.repositories,
                );
                if repos.is_empty() {
                    info!(
                        "GitHub PR fetch skipped: no github.repo, no per-repo org, \
                         and no github.org resolvable from repositories[]"
                    );
                } else {
                    // The environment is only consulted when the config
                    // carries no token.
                    let anonymous =
                        gh_cfg.token.is_none() && std::env::var("GITHUB_TOKEN").is_err();
                    if anonymous {
                        // Issue #211: surface the token misconfiguration loudly.
                        // Without a PAT, GitHub limits anonymous traffic to 60
                        // requests/hour, which silently truncates org-wide PR
                        // pulls and is the #1 reason `pull_requests` ends up
                        // empty after a `tga collect` run.
                        let msg = "GitHub PR fetch is enabled (github.fetch_prs=true) but \
                                   no token is configured. Set `github.token` or the \
                                   GITHUB_TOKEN env var to a PAT with `repo` scope (public \
                                   repos only need `public_repo`); without it, GitHub \
                                   rate-limits to 60 requests/hour and most PRs will be \
                                   missed.";
                        warn!("{msg}");
                        eprintln!("warning: {msg}");
                        info!(
                            repo_count = repos.len(),
                            "GitHub PR fetcher will scan {} repo(s) anonymously",
                            repos.len()
                        );
                    } else {
                        info!(
                            repo_count = repos.len(),
                            "GitHub PR fetcher will scan {} repo(s)",
                            repos.len()
                        );
                    }
                    // The client is built the same way with or without a
                    // token; only the diagnostics above differ.
                    match GitHubClient::new_for_prs(gh_cfg, repos) {
                        Ok(gh) => enabled.push(Box::new(gh)),
                        Err(e) => stats.errors.push(format!("GitHub client init failed: {e}")),
                    }
                }
            }
            Some(_) => {
                // Issue #211: when the github config block exists but
                // fetch_prs is false (the default), the pull_requests table
                // ends up empty even though the user has clearly opted into
                // GitHub integration. Emit a one-shot diagnostic so the
                // operator can find the toggle without grepping the source.
                info!(
                    "GitHub PR fetch disabled (github.fetch_prs=false). Set \
                     `github.fetch_prs: true` in your config to populate the \
                     pull_requests table."
                );
            }
            None if has_github_like_repos(&self.config.repositories) => {
                // Issue #211: zero `pull_requests` rows is the single most
                // common "tga seems broken" question. Detect the most likely
                // misconfiguration (repos look like GitHub clones but no
                // `github:` block in the config) and tell the operator how to
                // fix it before they go hunting through the code.
                let msg = "Repositories look like GitHub clones, but no `github:` config \
                           block is present. To populate the `pull_requests` table, add:\n\
                           \n\
                           github:\n  \
                             token: \"${GITHUB_TOKEN}\"   # PAT with `repo` scope\n  \
                             fetch_prs: true\n  \
                             repo: \"owner/name\"         # OR `org: \"owner\"` for org-wide\n";
                info!("{msg}");
            }
            None => {}
        }

        match &self.config.bitbucket {
            Some(bb_cfg) if bb_cfg.fetch_prs => match BitbucketClient::new(bb_cfg) {
                Ok(bb) => enabled.push(Box::new(bb)),
                Err(e) => stats
                    .errors
                    .push(format!("Bitbucket client init failed: {e}")),
            },
            _ => {}
        }

        enabled
    }

    /// Fetch pull requests from every configured provider concurrently and
    /// persist the results from this task.
    ///
    /// Each provider returned by `build_pr_providers` gets its own Tokio
    /// task, so one slow remote does not hold up the others. A task only
    /// hands back `(provider name, fetch result)`; the `&mut Database` never
    /// leaves this function, and every write goes through the matching
    /// provider's `store_pull_requests`.
    ///
    /// Nothing is returned: join, fetch, and store failures are appended to
    /// `stats.errors`, and the number of stored rows is added to
    /// `stats.prs_fetched`.
    async fn fetch_and_store_prs(&self, db: &mut Database, stats: &mut CollectionStats) {
        use std::sync::Arc;
        use tokio::task::JoinSet;

        let boxed = self.build_pr_providers(stats);
        if boxed.is_empty() {
            return;
        }

        // Share each provider between its fetch task and the storage step
        // below, which needs the same provider again once the fetch is done.
        let shared: Vec<Arc<dyn PrProvider + Send + Sync>> =
            boxed.into_iter().map(Arc::from).collect();

        let mut tasks: JoinSet<(String, Result<Vec<PullRequest>>)> = JoinSet::new();
        for provider in &shared {
            let worker = Arc::clone(provider);
            let provider_name = worker.name().to_string();
            tasks.spawn(async move {
                let outcome = worker.fetch_pull_requests().await;
                (provider_name, outcome)
            });
        }

        // Handle results in completion order; each failure is recorded and
        // the loop moves on to the next finished task.
        while let Some(joined) = tasks.join_next().await {
            let (provider_name, outcome) = match joined {
                Ok(pair) => pair,
                Err(e) => {
                    stats.errors.push(format!("PR fetch task panicked: {e}"));
                    continue;
                }
            };

            let prs = match outcome {
                Ok(prs) => prs,
                Err(e) => {
                    stats
                        .errors
                        .push(format!("{provider_name} PR fetch failed: {e}"));
                    continue;
                }
            };

            // Map the task's name back to the provider that must store the rows.
            let Some(owner) = shared
                .iter()
                .find(|candidate| candidate.name() == provider_name)
            else {
                stats.errors.push(format!(
                    "internal: no provider registered for '{provider_name}' \
                     when storing PRs"
                ));
                continue;
            };

            match owner.store_pull_requests(db, &prs) {
                Ok(stored) => {
                    info!(provider = %provider_name, prs = stored, "stored pull requests");
                    stats.prs_fetched += stored;
                }
                Err(e) => stats
                    .errors
                    .push(format!("{provider_name} PR store failed: {e}")),
            }
        }
    }

    /// Collect one repository in ISO-week slices.
    ///
    /// When the collector has a `since` bound, the `[since, until-or-today]`
    /// range is walked week by week. A week that `collection_runs` already
    /// marks as collected is skipped unless `self.force` is set; every other
    /// week is collected (clamped to the requested range) and then recorded.
    ///
    /// Without a `since` bound there is no week bookkeeping: the repository
    /// is collected in a single pass (optionally capped by `until`) after a
    /// loud warning, and the function returns.
    ///
    /// Nothing here is fatal. Every failure is logged and appended to
    /// `stats.errors` so one bad week or repository does not abort the run.
    fn collect_repo_by_week(
        &self,
        db: &mut Database,
        collector: &GitCollector,
        stats: &mut CollectionStats,
    ) {
        let repo_name = collector.name().to_string();

        // No lower bound: there is no week range to iterate, so fall back to
        // a single full-history pass. Kept for backwards compatibility, but
        // a re-run repeats the whole walk, hence the explicit warnings.
        let Some(since) = collector.since() else {
            match collector.until() {
                Some(until) => {
                    // `until` without `since`: open-ended on the lower side.
                    warn!(
                        repo = %repo_name,
                        "until_date set without since_date — collecting full git history. \
                         Use --weeks N or set analysis.since_date in config to limit scope."
                    );
                    eprintln!(
                        "warning: [{repo_name}] no since_date / --weeks — collecting FULL git history. \
                         Set analysis.since_date or pass --weeks N to limit scope."
                    );
                    match collector.collect_window(db, None, Some(until)) {
                        Ok(n) => {
                            info!(repo = %repo_name, commits = n, "extracted (until-only)");
                            stats.commits_collected += n;
                        }
                        Err(e) => {
                            let msg = format!("collection failed for {repo_name}: {e}");
                            warn!("{msg}");
                            stats.errors.push(msg);
                        }
                    }
                }
                None => {
                    // Fully unbounded (Bug #65): warn explicitly.
                    warn!(
                        repo = %repo_name,
                        "no since_date or --weeks flag set — collecting full git history. \
                         Use --weeks N or set analysis.since_date in config to limit scope."
                    );
                    eprintln!(
                        "warning: [{repo_name}] no since_date / --weeks — collecting FULL git history. \
                         Set analysis.since_date or pass --weeks N to limit scope."
                    );
                    match collector.collect(db) {
                        Ok(n) => {
                            info!(repo = %repo_name, commits = n, "extracted (unbounded)");
                            stats.commits_collected += n;
                        }
                        Err(e) => {
                            let msg = format!("collection failed for {repo_name}: {e}");
                            warn!("{msg}");
                            stats.errors.push(msg);
                        }
                    }
                }
            }
            return;
        };

        // Bounded path: `until` defaults to today when absent (the common
        // case for --weeks / --from).
        let range_start = since.date_naive();
        let range_end = collector
            .until()
            .map_or_else(|| Utc::now().date_naive(), |until| until.date_naive());

        // Same value for every recorded week, so compute it once.
        let repo_count = self.config.repositories.len();

        for week in weeks_in_range(range_start, range_end) {
            let (year, week_no, _, _) = week;

            // Skip-if-collected check; `--force` bypasses it entirely.
            if !self.force {
                let already_collected = match db::is_week_collected(db, &repo_name, year, week_no) {
                    Ok(flag) => flag,
                    Err(e) => {
                        let msg = format!(
                            "collection_runs lookup failed for {repo_name} W{week_no} {year}: {e}"
                        );
                        warn!("{msg}");
                        stats.errors.push(msg);
                        continue;
                    }
                };
                if already_collected {
                    info!("Skipping {repo_name} W{week_no} {year} — already collected");
                    println!(
                        "Skipped   W{week_no:02} {year}: already collected \
                         (use --force to re-collect) [{repo_name}]"
                    );
                    stats.weeks_skipped += 1;
                    continue;
                }
            }

            // Clamp the week to the requested range so partial weeks at
            // either edge do not pull commits from outside it.
            let (win_start, win_end) = clamp_week_to_range(week, range_start, range_end);
            let outcome = collector.collect_window(
                db,
                Some(naive_date_start_utc(win_start)),
                Some(naive_date_end_utc(win_end)),
            );
            let n = match outcome {
                Ok(n) => n,
                Err(e) => {
                    let msg = format!("collection failed for {repo_name} W{week_no} {year}: {e}");
                    warn!("{msg}");
                    stats.errors.push(msg);
                    continue;
                }
            };

            info!(
                repo = %repo_name,
                year,
                week = week_no,
                commits = n,
                "extracted week"
            );
            println!("Collected W{week_no:02} {year}: {n} commits [{repo_name}]");
            stats.commits_collected += n;
            stats.weeks_collected += 1;

            // The week still counts as collected if the bookkeeping write
            // fails; the failure is only reported.
            if let Err(e) = db::record_collection_run(db, &repo_name, year, week_no, n, repo_count)
            {
                let msg = format!(
                    "failed to record collection_run for {repo_name} W{week_no} {year}: {e}"
                );
                warn!("{msg}");
                stats.errors.push(msg);
            }
        }
    }

    /// Link commits that have no `author_id` yet to rows in the authors table.
    ///
    /// Reads every distinct `(author_name, author_email)` pair among commits
    /// whose `author_id` is NULL, upserts each pair through `resolver`, and
    /// stamps the returned id onto the matching still-unlinked commits.
    ///
    /// Returns the number of distinct pairs processed. The first SQL or
    /// resolver error aborts the loop and is returned.
    fn upsert_observed_authors(
        &self,
        db: &mut Database,
        resolver: &IdentityResolver,
    ) -> Result<usize> {
        // Materialise the pairs up front so no prepared statement is still
        // borrowing the connection when the mutating calls below run.
        let pairs: Vec<(String, String)> = {
            let mut stmt = db.connection().prepare(
                "SELECT DISTINCT author_name, author_email FROM commits WHERE author_id IS NULL",
            )?;
            let rows = stmt.query_map([], |row| {
                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
            })?;
            rows.collect::<rusqlite::Result<_>>()?
        };

        let total = pairs.len();
        for (name, email) in pairs {
            let author_id = resolver.upsert_author(db, &name, &email)?;
            db.connection().execute(
                "UPDATE commits SET author_id = ?1 \
                 WHERE author_id IS NULL AND author_name = ?2 AND author_email = ?3",
                rusqlite::params![author_id, name, email],
            )?;
        }
        Ok(total)
    }

    /// Scan stored commit messages for `AB#N` references, batch-fetch the
    /// referenced ADO work items, and persist them in `work_items` and
    /// `commit_work_items`.
    ///
    /// The pipeline pulls `(sha, message)` from `commits`, computes the unique
    /// set of referenced IDs, calls
    /// [`AzureDevOpsClient::get_work_items`] in batches of up to 200, then
    /// upserts the resulting rows and inserts join-table links.
    ///
    /// All work-item linking is done in a single transaction so that a partial
    /// failure doesn't leave dangling rows.
    ///
    /// # Errors
    ///
    /// Returns a [`crate::collect::CollectError`] if reading commits, calling
    /// ADO, or writing to SQLite fails.
    async fn fetch_and_persist_azdo_work_items(
        &self,
        db: &mut Database,
        client: &AzureDevOpsClient,
        azdo_cfg: &crate::core::config::AzureDevOpsConfig,
    ) -> Result<()> {
        use crate::collect::azdo::extract_work_item_refs;
        use std::collections::{BTreeSet, HashMap};

        // The ticket_regex pattern is validated at config load
        // (`Config::validate_ticket_regexes`), so compilation here cannot fail
        // under normal flow. We still propagate the error rather than panic
        // to keep the no-`unwrap()` invariant in library code from CLAUDE.md.
        let ticket_re = regex::Regex::new(&azdo_cfg.ticket_regex).map_err(|e| {
            crate::collect::CollectError::Config(format!(
                "pm.azure_devops.ticket_regex {:?} failed to compile: {e}",
                azdo_cfg.ticket_regex
            ))
        })?;

        // 1. Pull (sha, message) pairs from the database.
        let rows: Vec<(String, String)> = {
            let conn = db.connection();
            let mut stmt = conn.prepare("SELECT sha, message FROM commits")?;
            let mapped = stmt.query_map([], |row| {
                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
            })?;
            let mut out = Vec::new();
            for r in mapped {
                out.push(r?);
            }
            out
        };

        // 2. Build commit -> [work_item_id] map and the unique ID set.
        let mut commit_refs: HashMap<String, Vec<u32>> = HashMap::new();
        let mut all_ids: BTreeSet<u32> = BTreeSet::new();
        for (sha, msg) in &rows {
            let ids = extract_work_item_refs(&ticket_re, msg);
            if !ids.is_empty() {
                for id in &ids {
                    all_ids.insert(*id);
                }
                commit_refs.insert(sha.clone(), ids);
            }
        }

        if all_ids.is_empty() {
            info!(
                pattern = %azdo_cfg.ticket_regex,
                "No work-item references found in commit messages; skipping ADO work item fetch",
            );
            return Ok(());
        }

        // 3. Batch-fetch the referenced work items.
        let ids: Vec<u32> = all_ids.iter().copied().collect();
        let items = match client.get_work_items(&ids).await {
            Ok(v) => v,
            Err(e) => {
                warn!("ADO get_work_items failed: {e}");
                return Ok(());
            }
        };
        info!(
            fetched = items.len(),
            commits = commit_refs.len(),
            "Fetched ADO work items for commits",
        );

        // 4. Persist work items and commit links in a single transaction.
        let tx = db.connection_mut().transaction()?;
        let fetched_ids: std::collections::HashSet<u32> = items.iter().map(|w| w.id).collect();
        for w in &items {
            let raw_json = serde_json::to_string(w).ok();
            let tags_csv = if w.tags.is_empty() {
                None
            } else {
                Some(w.tags.join(","))
            };
            let row = crate::core::db::WorkItemRow {
                id: w.id.to_string(),
                source: "azdo".to_string(),
                title: w.title.clone(),
                status: w.state.clone(),
                item_type: w.work_item_type.clone(),
                tags: tags_csv,
                project: Some(w.team_project.clone()),
                url: w.url.clone(),
                raw_json,
            };
            crate::core::db::work_items::upsert_work_item(&tx, &row)?;
        }
        for (sha, ref_ids) in &commit_refs {
            for id in ref_ids {
                // Skip refs that ADO didn't return (deleted, scope-restricted)
                // to avoid FK violations on the join table.
                if !fetched_ids.contains(id) {
                    continue;
                }
                crate::core::db::work_items::link_commit_work_item(
                    &tx,
                    sha,
                    &id.to_string(),
                    "azdo",
                )?;
            }
        }
        tx.commit()?;
        Ok(())
    }

    /// Fetch ADO pull requests referenced by commit-message `Merged PR NNNN:`
    /// patterns and persist them (with reviewers) under provider `'azdo'`.
    ///
    /// Why: ADO PRs are the source of review-pattern signals (vote
    /// distribution, reviewer load) that are absent from the bare git history;
    /// they live in the same `pull_requests` table as GitHub PRs but
    /// scoped by the `provider` column.
    /// What: pulls commit messages, extracts PR IDs, fetches each PR serially
    /// from `GET {org}/{project}/_apis/git/pullrequests/{id}`, and upserts
    /// rows into `pull_requests` + `pr_reviewers`. When `force_refresh_prs`
    /// is set, the PR-ID deduplication cache is bypassed so stale rows are
    /// re-fetched.
    /// Test: PR-ID extraction, DB CRUD, and config wiring are covered in
    /// `azdo::pr_fetcher::tests`. The full path is exercised by integration
    /// tests gated on a live ADO instance.
    ///
    /// # Errors
    ///
    /// Returns a [`crate::collect::CollectError`] for SQL failures. HTTP
    /// failures on individual PRs are logged and do not abort the run.
    async fn fetch_and_persist_azdo_prs(
        &self,
        db: &mut Database,
        azdo_cfg: &crate::core::config::AzureDevOpsConfig,
    ) -> Result<usize> {
        use crate::collect::azdo::AdoPrFetcher;

        let messages: Vec<String> = {
            let conn = db.connection();
            let mut stmt = conn.prepare("SELECT message FROM commits")?;
            let rows = stmt.query_map([], |row| row.get::<_, String>(0))?;
            let mut out = Vec::new();
            for r in rows {
                out.push(r?);
            }
            out
        };

        let fetcher = match AdoPrFetcher::new(azdo_cfg.clone()) {
            Ok(f) => f,
            Err(e) => {
                warn!("ADO PR fetcher init failed: {e}");
                return Ok(0);
            }
        };
        let conn = db.connection();
        let stored = fetcher
            .run_with_options(
                conn,
                messages.iter().map(String::as_str),
                self.force_refresh_prs,
            )
            .await?;
        Ok(stored)
    }
}

/// Best-effort detector for "this repo looks like a GitHub clone".
///
/// Why: when zero `pull_requests` end up in the DB after `tga collect`,
/// the usual cause is a missing `github:` block in the YAML. Detecting
/// this cheaply lets us emit one concrete remediation line instead of
/// leaving the operator to grep through the source. See issue #211.
/// What: for each [`RepositoryConfig`], opens the repo via `git2`, reads
/// `origin`'s URL, and returns `true` as soon as one URL's *host* is
/// `github.com` or a subdomain of it (e.g. `ssh.github.com`). The host is
/// extracted from the URL and compared case-insensitively. A plain
/// substring test is deliberately not used: it would also fire on local
/// paths and on other hosts that merely mention `github.com`
/// (`/srv/github.com-mirror/x`, `gitlab.com/team/github.com-import`,
/// `notgithub.com`). Any error (no `origin`, unreadable repo, non-UTF-8 or
/// non-GitHub URL) is silently skipped.
/// Test: covered indirectly — exercised by `tga collect` runs against
/// real clones.
fn has_github_like_repos(repositories: &[crate::core::config::RepositoryConfig]) -> bool {
    /// Returns `true` when the host part of a git remote URL is GitHub.
    ///
    /// Handles `scheme://[user[:pass]@]host[:port]/path` as well as the
    /// scp-like `[user@]host:path` form that git accepts for SSH remotes.
    fn is_github_remote(url: &str) -> bool {
        let url = url.trim();
        let (has_scheme, rest) = match url.split_once("://") {
            Some((_, rest)) => (true, rest),
            None => (false, url),
        };
        // The authority ends at the first `/`. In the scp-like form (no
        // scheme) the first `:` separates host from path, so it ends there
        // too. A local path starts with `/`, giving an empty authority.
        let authority_end = rest
            .find(|c: char| c == '/' || (!has_scheme && c == ':'))
            .unwrap_or(rest.len());
        let authority = &rest[..authority_end];
        // Drop `user[:password]@` and any `:port` suffix.
        let host_port = authority.rsplit_once('@').map_or(authority, |(_, host)| host);
        let host = host_port
            .split(':')
            .next()
            .unwrap_or(host_port)
            .to_ascii_lowercase();
        host == "github.com" || host.ends_with(".github.com")
    }

    for repo_cfg in repositories {
        let Ok(repo) = git2::Repository::open(&repo_cfg.path) else {
            continue;
        };
        let Ok(remote) = repo.find_remote("origin") else {
            continue;
        };
        let Some(url) = remote.url() else {
            continue;
        };
        if is_github_remote(url) {
            return true;
        }
    }
    false
}

/// Return the first instant of calendar day `d` in UTC (00:00:00).
fn naive_date_start_utc(d: NaiveDate) -> DateTime<Utc> {
    d.and_hms_opt(0, 0, 0)
        .map(|midnight| Utc.from_utc_datetime(&midnight))
        .expect("00:00:00 is always a valid time")
}

/// Number of commits whose `author_id` is still NULL, i.e. whose author
/// could not be linked to a row in the `authors` table.
///
/// Why: see issue #68. Phantom identities silently inflate developer counts
/// in downstream reports, so their existence should be surfaced loudly.
/// What: runs a single `COUNT(*)` query and returns the result; a SQL
/// failure is converted to `TgaError` and returned (callers are expected to
/// treat it as best-effort rather than abort).
/// Test: seed an in-memory DB with one commit whose author_id is NULL and
/// one with author_id set; assert the count is 1.
fn count_unresolved_commits(db: &Database) -> Result<usize> {
    const SQL: &str = "SELECT COUNT(*) FROM commits WHERE author_id IS NULL";

    let unresolved = db
        .connection()
        .query_row(SQL, [], |row| row.get::<_, i64>(0))
        .map_err(crate::core::TgaError::from)?;
    // COUNT(*) is never negative, so the cast only changes the integer type.
    Ok(unresolved as usize)
}

/// Return the last whole second of calendar day `d` in UTC (23:59:59).
fn naive_date_end_utc(d: NaiveDate) -> DateTime<Utc> {
    d.and_hms_opt(23, 59, 59)
        .map(|last_second| Utc.from_utc_datetime(&last_second))
        .expect("23:59:59 is always a valid time")
}