tga 2.8.1

Developer productivity analytics — git commit collection, classification, and reporting
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
//! Async GitHub REST API v3 client for pull-request and issue metadata.

use rusqlite::params;
use tracing::{debug, warn};

use async_trait::async_trait;

use crate::collect::errors::Result;
use crate::collect::github::repo_resolver::{build_http_client, parse_slug};
use crate::collect::github::retry::retry_get;
use crate::collect::github::types::{ApiPull, GitHubIssue, GitHubPrCommit, GitHubReview};
use crate::collect::pr_provider::PrProvider;
use crate::core::config::GithubConfig;
use crate::core::db::Database;
use crate::core::models::{PrState, PullRequest};

/// GitHub REST API base URL.
///
/// Every endpoint URL built in this file is prefixed with this value.
pub(crate) const GITHUB_API_BASE: &str = "https://api.github.com";
/// Page size for paginated list endpoints (GitHub max is 100).
///
/// The pagination loops in this file also use it to detect the last page:
/// a response carrying fewer than `PAGE_SIZE` items ends the loop.
pub(crate) const PAGE_SIZE: u32 = 100;
/// HTTP `User-Agent` string sent on every request.
///
/// NOTE(review): this constant is not referenced anywhere in this file —
/// presumably it is applied by `build_http_client`; confirm. The `0.1`
/// version suffix is hard-coded and is not derived from the crate version.
pub(crate) const USER_AGENT_VALUE: &str = "trusty-git-analytics/0.1";

/// Async GitHub REST client.
///
/// Supports single-repo and multi-repo PR collection. The `owner` / `repo`
/// pair is the "primary" repository used by issue-oriented endpoints
/// ([`Self::fetch_issue`], [`Self::list_issues`]) and by
/// [`Self::fetch_pr_commits`]. The `repos` vector lists every repository the
/// bulk PR fetcher will iterate over and always contains the primary repo as
/// the first entry when one is set.
///
/// A client built with [`Self::new_for_reviews`] has an empty `owner`, `repo`
/// and `repos`; only methods that take explicit `(owner, repo)` arguments are
/// meaningful on such a client.
pub struct GitHubClient {
    /// HTTP client produced by `build_http_client` from the `GithubConfig`.
    pub(crate) client: reqwest::Client,
    /// Token copied from `GithubConfig::token` by the constructors; its
    /// presence is what [`Self::has_token`] reports.
    pub(crate) token: Option<String>,
    /// Primary `owner` for issue-oriented endpoints.
    pub(crate) owner: String,
    /// Primary `repo` for issue-oriented endpoints.
    pub(crate) repo: String,
    /// Every `(owner, repo)` pair the PR fetcher will scan, in order. Holds
    /// exactly one entry in single-repo mode ([`Self::new`]); may contain
    /// many entries in org / multi-repo mode (see [`Self::new_for_prs`]);
    /// is empty only for clients built with [`Self::new_for_reviews`].
    pub(crate) repos: Vec<(String, String)>,
}

/// Build the JSON text stored in a PR row's `commit_shas` column.
///
/// Why: GitHub fills in `merge_commit_sha` even for PRs that are still open
/// or were closed without merging — in that case it is the SHA of a *test*
/// merge commit on `refs/pull/N/merge` (a mergeability probe). Such a SHA
/// lives on no branch and cannot be joined against the `commits` table
/// (issue #101). Only a PR with `merged_at` set carries a joinable SHA.
/// What: yields `["<sha>"]` when the PR is merged *and* has a SHA; every
/// other combination yields the empty JSON array `[]`.
/// Test: see `commit_shas_gated_on_merged_at` — a non-merged PR with a
/// populated SHA produces `"[]"`, a merged PR produces `r#"["<sha>"]"#`.
///
/// # Errors
///
/// Propagates the `serde_json` error if the SHA cannot be serialized.
pub(crate) fn commit_shas_for_pull(p: &ApiPull) -> Result<String> {
    // An unmerged PR never contributes a SHA, whatever `merge_commit_sha` holds.
    if p.merged_at.is_none() {
        return Ok("[]".to_string());
    }
    match &p.merge_commit_sha {
        // A one-element array of a borrowed SHA serializes to `["<sha>"]`.
        Some(sha) => Ok(serde_json::to_string(&[sha])?),
        None => Ok("[]".to_string()),
    }
}

impl GitHubClient {
    /// Create a single-repository client from a [`GithubConfig`].
    ///
    /// `config.repo` must hold an `owner/name` slug. The parsed pair becomes
    /// both the primary repository and the sole entry of `repos`. When only
    /// the org-wide mode is configured (`org` set, `repo` unset) this
    /// constructor fails; use [`Self::new_for_prs`] for that case.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Config`] if `repo` is missing or malformed.
    /// - [`crate::collect::errors::CollectError::Http`] if the underlying `reqwest::Client`
    ///   cannot be built.
    pub fn new(config: &GithubConfig) -> Result<Self> {
        use crate::collect::errors::CollectError;

        // Resolve the slug first so a bad config fails before any client is built.
        let (owner, repo) = match config.repo.as_ref() {
            Some(slug) => parse_slug(slug)?,
            None => {
                return Err(CollectError::Config(
                    "github.repo is required (owner/name)".into(),
                ));
            }
        };
        let client = build_http_client(config)?;
        let repos = vec![(owner.clone(), repo.clone())];

        Ok(Self {
            client,
            token: config.token.clone(),
            owner,
            repo,
            repos,
        })
    }

    /// Create a client whose bulk PR fetch walks every `(owner, repo)` in
    /// `repos`.
    ///
    /// Why: org-wide / multi-repo deployments drive PR collection from
    /// `repositories[]` (or `github.org` as fallback) instead of a single
    /// `github.repo`. Mirrors the ADO PR-fetcher contract from #84.
    /// What: keeps the whole list and promotes its first entry to the
    /// "primary" repository. Issue endpoints stay single-repo — the PM
    /// adapter still needs a concrete `owner/repo` to hit
    /// `GET /repos/{o}/{r}/issues/{n}`.
    /// Test: covered by `multi_repo_constructor_*` in `client_tests.rs`.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Config`] if `repos` is empty.
    /// - [`crate::collect::errors::CollectError::Http`] if the underlying `reqwest::Client`
    ///   cannot be built.
    pub fn new_for_prs(config: &GithubConfig, repos: Vec<(String, String)>) -> Result<Self> {
        use crate::collect::errors::CollectError;

        // The first entry doubles as the primary repo; no entry means no client.
        let (owner, repo) = match repos.first() {
            Some(primary) => primary.clone(),
            None => {
                return Err(CollectError::Config(
                    "GitHubClient::new_for_prs requires at least one (owner, repo)".into(),
                ));
            }
        };
        let client = build_http_client(config)?;

        Ok(Self {
            client,
            token: config.token.clone(),
            owner,
            repo,
            repos,
        })
    }

    /// Create a bare client intended solely for fetching PR reviews.
    ///
    /// Why: the reviewer-ingestion pass calls
    /// `fetch_pr_reviews_for_repo(owner, repo, pr_number)` and should not have
    /// to invent a repo slug to get a client (the earlier
    /// `new_for_prs("_dummy","_dummy")` workaround was fragile — it relied on
    /// the reviews method ignoring `self.owner`).
    /// What: builds the HTTP client and copies the token; `owner`, `repo` and
    /// `repos` stay empty. Call only methods that take explicit
    /// `(owner, repo)` arguments on the result.
    /// Test: `new_for_reviews_builds_without_dummy_slugs` in `client_tests.rs`.
    ///
    /// # Errors
    ///
    /// Returns [`crate::collect::errors::CollectError::Http`] if the `reqwest::Client`
    /// cannot be built.
    pub fn new_for_reviews(config: &GithubConfig) -> Result<Self> {
        let client = build_http_client(config)?;
        let token = config.token.clone();
        Ok(Self {
            client,
            token,
            // Deliberately blank: this client has no primary repository.
            owner: String::new(),
            repo: String::new(),
            repos: Vec::new(),
        })
    }

    /// Collect every PR (open, closed and merged) from each repository in
    /// `self.repos`, in list order.
    ///
    /// Partial-success semantics (issue #87): a repository whose fetch fails
    /// (404, no token access, transient 5xx after retries) is logged with
    /// `warn!` and skipped so the remaining repositories are still collected.
    /// PRs from the failing repository are not included in the result.
    ///
    /// # Errors
    ///
    /// As written, per-repository failures are swallowed after logging, so
    /// this method returns `Ok` even when every repository failed (the vector
    /// is then empty). The `Result` return type is kept for the
    /// [`PrProvider`] contract.
    pub async fn fetch_pull_requests(&self) -> Result<Vec<PullRequest>> {
        let mut collected: Vec<PullRequest> = Vec::new();
        for (owner, repo) in &self.repos {
            let outcome = self.fetch_pull_requests_for_repo(owner, repo).await;
            let prs = match outcome {
                Ok(prs) => prs,
                Err(err) => {
                    // One bad repo must not abort collection for the rest.
                    warn!(
                        owner = %owner,
                        repo = %repo,
                        error = %err,
                        "GitHub PR fetch failed for repo; continuing with remaining repos"
                    );
                    continue;
                }
            };
            collected.extend(prs);
        }
        Ok(collected)
    }

    /// Page through `GET /repos/{owner}/{repo}/pulls?state=all` and convert
    /// every returned pull into a [`PullRequest`]. Internal helper for
    /// [`Self::fetch_pull_requests`].
    ///
    /// Each row gets `id: 0`, `repository` set to `"{owner}/{repo}"`, an
    /// empty author when the API returned no user, and a state derived as:
    /// merged if `merged_at` is set, otherwise closed if the API state is
    /// `"closed"`, otherwise open. Paging stops at the first page holding
    /// fewer than `PAGE_SIZE` items (an empty page included).
    ///
    /// # Errors
    ///
    /// Propagates request, HTTP-status, JSON-decoding and
    /// `commit_shas_for_pull` failures; PRs gathered so far are discarded.
    async fn fetch_pull_requests_for_repo(
        &self,
        owner: &str,
        repo: &str,
    ) -> Result<Vec<PullRequest>> {
        let repository = format!("{owner}/{repo}");
        let mut collected: Vec<PullRequest> = Vec::new();
        let mut page = 1u32;
        loop {
            let url = format!(
                "{GITHUB_API_BASE}/repos/{owner}/{repo}/pulls?state=all&per_page={PAGE_SIZE}&page={page}"
            );
            debug!(url = %url, "GET");
            let resp = self.retry_request(&url).await?;

            // Surface a warning when the rate-limit budget is about to run out.
            // A missing or unparsable header is simply ignored.
            let remaining = resp
                .headers()
                .get("x-ratelimit-remaining")
                .and_then(|value| value.to_str().ok())
                .and_then(|text| text.parse::<u32>().ok());
            if let Some(rem) = remaining.filter(|left| *left < 5) {
                warn!(remaining = rem, "GitHub rate limit nearly exhausted");
            }

            let pulls: Vec<ApiPull> = resp.error_for_status()?.json().await?;
            // A short page (possibly empty) is the last one.
            let last_page = (pulls.len() as u32) < PAGE_SIZE;

            for p in pulls {
                let state = match p.merged_at {
                    Some(_) => PrState::Merged,
                    None if p.state == "closed" => PrState::Closed,
                    None => PrState::Open,
                };
                // Computed while `p` is still intact; fields are moved out below.
                let commit_shas = commit_shas_for_pull(&p)?;
                collected.push(PullRequest {
                    id: 0,
                    pr_number: p.number,
                    repository: repository.clone(),
                    title: p.title,
                    author: p.user.map(|u| u.login).unwrap_or_default(),
                    state,
                    created_at: p.created_at,
                    merged_at: p.merged_at,
                    commit_shas,
                });
            }

            if last_page {
                break;
            }
            page += 1;
        }
        Ok(collected)
    }

    /// Upsert a batch of [`PullRequest`] rows into `pull_requests`, tagged
    /// with provider `"github"`.
    ///
    /// Why: `ON CONFLICT … DO UPDATE` keeps the existing row `id`, so
    /// FK-linked `pr_reviewers` survive re-collection; `INSERT OR REPLACE`
    /// wiped them (#752).
    /// What: a new `(provider, repository, pr_number)` inserts all columns;
    /// an existing one has `title`/`author`/`state`/`merged_at`/`commit_shas`
    /// refreshed, while `id` and `created_at` are never overwritten.
    /// Rows are written one statement at a time, in slice order.
    /// Test: reviewer_store tests cover FK-preservation.
    ///
    /// Returns the number of rows written, which on success equals
    /// `prs.len()`.
    ///
    /// # Errors
    ///
    /// Propagates [`crate::core::TgaError::DbError`] on SQL failures; the
    /// first failing row stops the loop.
    pub fn store_pull_requests(
        &self,
        db: &Database,
        prs: &[PullRequest],
    ) -> crate::core::Result<usize> {
        let upsert_sql = "INSERT INTO pull_requests \
                 (provider,repository,pr_number,title,author,state,created_at,merged_at,commit_shas) \
                 VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9) \
                 ON CONFLICT(provider,repository,pr_number) DO UPDATE SET \
                   title=excluded.title,author=excluded.author,state=excluded.state,\
                   merged_at=excluded.merged_at,commit_shas=excluded.commit_shas";
        let conn = db.connection();

        for pr in prs {
            // Timestamps are stored as RFC 3339 text; `merged_at` stays NULL when unset.
            let created_at = pr.created_at.to_rfc3339();
            let merged_at = pr.merged_at.map(|t| t.to_rfc3339());
            conn.execute(
                upsert_sql,
                params![
                    "github",
                    pr.repository,
                    pr.pr_number as i64,
                    pr.title,
                    pr.author,
                    pr.state.as_str(),
                    created_at,
                    merged_at,
                    pr.commit_shas,
                ],
            )?;
        }
        // Reaching this point means every row was written.
        Ok(prs.len())
    }

    /// Whether this client was constructed with an authentication token.
    ///
    /// Returns `true` when `self.token` is `Some`. The token's value is not
    /// inspected, so an empty string still counts as having a token.
    pub fn has_token(&self) -> bool {
        self.token.is_some()
    }

    /// Fetch a single issue by number from the GitHub REST API.
    ///
    /// Hits `GET /repos/{owner}/{repo}/issues/{number}` on the primary
    /// repository (`self.owner` / `self.repo`), using the same HTTP client
    /// as the bulk PR fetch.
    ///
    /// Why: the request goes through [`Self::retry_request`] like every other
    /// endpoint of this client, so a transient 429/502/504 is retried rather
    /// than failing the lookup on the first attempt.
    ///
    /// Returns `Ok(None)` when the API responds with `404 Not Found`
    /// (deleted or invisible issue). All other non-success statuses, as
    /// well as transport and JSON-parse failures, are propagated as
    /// [`crate::collect::errors::CollectError`].
    ///
    /// Note: on a client built with [`Self::new_for_reviews`] the primary
    /// owner/repo are empty, so the URL is malformed; use a client built with
    /// [`Self::new`] or [`Self::new_for_prs`] for issue lookups.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Http`] on transport or non-`404`
    ///   non-success HTTP responses after retries are exhausted.
    /// - [`crate::collect::errors::CollectError::Json`] on payload parse failures.
    pub async fn fetch_issue(&self, number: u64) -> Result<Option<GitHubIssue>> {
        let url = format!(
            "{GITHUB_API_BASE}/repos/{}/{}/issues/{number}",
            self.owner, self.repo
        );
        debug!(url = %url, "GET");
        let resp = self.retry_request(&url).await?;

        // A missing issue is an expected outcome, not an error; check it
        // before `error_for_status` would turn the 404 into `Err`.
        if resp.status() == reqwest::StatusCode::NOT_FOUND {
            return Ok(None);
        }

        let issue: GitHubIssue = resp.error_for_status()?.json().await?;
        Ok(Some(issue))
    }

    /// Send a GET request with exponential backoff on transient failures.
    ///
    /// Why: GitHub occasionally returns 502/504 under load and 429 when the
    /// per-token rate limit drains; a tiny retry loop avoids surfacing those
    /// as pipeline failures.
    /// What: delegates to the free [`retry_get`] helper, passing `self.client`.
    /// The retry policy itself lives in `retry_get`, not here. Callers in
    /// this file still apply `error_for_status` to the returned response.
    /// Test: covered indirectly by callers and by `wiremock` integration tests.
    async fn retry_request(&self, url: &str) -> Result<reqwest::Response> {
        retry_get(&self.client, url).await
    }

    /// Retrieve every review on one pull request, following pagination to
    /// the end.
    ///
    /// Why: review counts, approval status and review latency are core PR
    /// metrics, and the bulk-PR endpoint carries no review data. The target
    /// repository is passed explicitly instead of read from
    /// `self.owner`/`self.repo` because on a multi-repo client the primary
    /// repository is unrelated to the PR being inspected (issue #742 — the
    /// old signature silently fetched reviews from the wrong repo).
    /// What: `GET /repos/{owner}/{repo}/pulls/{pr_number}/reviews?per_page=100`,
    /// requesting successive pages until one comes back short.
    /// Test: deserialization shape covered by `github_review_deserializes`;
    /// correct routing verified by the reviewer-ingestion integration path.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Http`] on transport / non-success
    ///   HTTP responses after retries are exhausted.
    /// - [`crate::collect::errors::CollectError::Json`] on payload parse failures.
    pub async fn fetch_pr_reviews_for_repo(
        &self,
        owner: &str,
        repo: &str,
        pr_number: u64,
    ) -> Result<Vec<GitHubReview>> {
        let mut reviews = Vec::new();
        let mut page = 1u32;
        loop {
            let url = format!(
                "{GITHUB_API_BASE}/repos/{owner}/{repo}/pulls/{pr_number}/reviews?per_page={PAGE_SIZE}&page={page}"
            );
            let response = self.retry_request(&url).await?;
            let page_items: Vec<GitHubReview> = response.error_for_status()?.json().await?;

            // Decide before the items are moved into the accumulator.
            let is_last_page = (page_items.len() as u32) < PAGE_SIZE;
            reviews.extend(page_items);
            if is_last_page {
                return Ok(reviews);
            }
            page += 1;
        }
    }

    /// Expose the internal HTTP client for org-discovery requests.
    ///
    /// Why: `discover_org_repos` lives in a sibling module and needs the
    /// same authenticated `reqwest::Client` without duplicating the header
    /// build logic.
    /// What: returns a shared reference to the underlying `reqwest::Client`;
    /// nothing is cloned and the borrow is tied to `self`.
    /// Test: used by the reviewer-ingestion path in `collector.rs`.
    pub fn http_client(&self) -> &reqwest::Client {
        &self.client
    }

    /// Retrieve every commit attached to a pull request of the primary
    /// repository, following pagination to the end.
    ///
    /// Why: PR-level commit lists let us attribute work to the PR author and
    /// reconstruct review-window churn even when only the merge commit is
    /// recorded on the default branch.
    /// What: `GET /repos/{owner}/{repo}/pulls/{pr_number}/commits?per_page=100`
    /// with `owner`/`repo` taken from `self.owner`/`self.repo`, requesting
    /// successive pages until one comes back short. On a multi-repo client
    /// this always targets the first configured repository.
    /// Test: deserialization shape covered by `github_pr_commit_deserializes`.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Http`] on transport / non-success
    ///   HTTP responses after retries are exhausted.
    /// - [`crate::collect::errors::CollectError::Json`] on payload parse failures.
    pub async fn fetch_pr_commits(&self, pr_number: u64) -> Result<Vec<GitHubPrCommit>> {
        let (owner, repo) = (&self.owner, &self.repo);
        let mut commits = Vec::new();
        let mut page = 1u32;
        loop {
            let url = format!(
                "{GITHUB_API_BASE}/repos/{owner}/{repo}/pulls/{pr_number}/commits?per_page={PAGE_SIZE}&page={page}"
            );
            let response = self.retry_request(&url).await?;
            let page_items: Vec<GitHubPrCommit> = response.error_for_status()?.json().await?;

            // Decide before the items are moved into the accumulator.
            let is_last_page = (page_items.len() as u32) < PAGE_SIZE;
            commits.extend(page_items);
            if is_last_page {
                return Ok(commits);
            }
            page += 1;
        }
    }

    /// List issues on the configured repository, paginating until exhausted.
    ///
    /// Note: the GitHub `issues` endpoint includes pull requests in its
    /// response, and this method does not filter them out. Use
    /// [`Self::fetch_pull_requests`] for PR-specific work.
    ///
    /// Why: bulk issue listing is needed for backfilling ticket metadata
    /// when commit messages reference `#NNN` without a project prefix.
    /// What: `GET /repos/{owner}/{repo}/issues?state={state}&per_page=100&page={n}`
    /// with `&since={since}` appended when provided, looping until a page
    /// comes back with fewer than `PAGE_SIZE` items.
    /// Test: integration-tested via the `pm` adapter suite; deserialization
    /// reuses `GitHubIssue` whose shape is unit-tested above.
    ///
    /// # Arguments
    ///
    /// * `state` — one of `"open"`, `"closed"`, or `"all"`; inserted verbatim.
    /// * `since` — optional ISO8601 timestamp; only issues updated at or
    ///   after this time are returned. The characters `+`, space, `&` and `#`
    ///   are percent-encoded before the value is placed in the query string,
    ///   so an RFC 3339 offset such as `+00:00` is not decoded as a space by
    ///   the server. `%` is left untouched, so an already-encoded value is
    ///   passed through unchanged.
    ///
    /// # Errors
    ///
    /// - [`crate::collect::errors::CollectError::Http`] on transport / non-success
    ///   HTTP responses after retries are exhausted.
    /// - [`crate::collect::errors::CollectError::Json`] on payload parse failures.
    pub async fn list_issues(&self, state: &str, since: Option<&str>) -> Result<Vec<GitHubIssue>> {
        // Escape only the characters that would change the meaning of the
        // query string; everything else (including `%`) is copied as-is.
        fn encode_query_value(raw: &str) -> String {
            let mut encoded = String::with_capacity(raw.len());
            for ch in raw.chars() {
                match ch {
                    '+' => encoded.push_str("%2B"),
                    ' ' => encoded.push_str("%20"),
                    '&' => encoded.push_str("%26"),
                    '#' => encoded.push_str("%23"),
                    other => encoded.push(other),
                }
            }
            encoded
        }

        // The `since` suffix does not change between pages; build it once.
        let since_param = since
            .map(|s| format!("&since={}", encode_query_value(s)))
            .unwrap_or_default();

        let mut out = Vec::new();
        let mut page = 1u32;
        loop {
            let url = format!(
                "{GITHUB_API_BASE}/repos/{}/{}/issues?state={state}&per_page={PAGE_SIZE}&page={page}{since_param}",
                self.owner, self.repo
            );
            let resp = self.retry_request(&url).await?.error_for_status()?;
            let batch: Vec<GitHubIssue> = resp.json().await?;
            let n = batch.len();
            out.extend(batch);
            // A short page (possibly empty) is the last one.
            if (n as u32) < PAGE_SIZE {
                break;
            }
            page += 1;
        }
        Ok(out)
    }
}

// Adapter exposing `GitHubClient` through the provider-agnostic `PrProvider`
// trait. Each trait method forwards to the inherent method of the same name;
// the explicit `GitHubClient::` path is what selects the inherent method, so
// keep it when editing to avoid the trait method calling itself.
#[async_trait]
impl PrProvider for GitHubClient {
    /// Provider identifier; matches the `"github"` value that
    /// `GitHubClient::store_pull_requests` writes to the `provider` column.
    fn name(&self) -> &str {
        "github"
    }

    /// Forwards to the inherent `GitHubClient::fetch_pull_requests`.
    async fn fetch_pull_requests(&self) -> Result<Vec<PullRequest>> {
        GitHubClient::fetch_pull_requests(self).await
    }

    /// Forwards to the inherent `GitHubClient::store_pull_requests`.
    fn store_pull_requests(
        &self,
        db: &Database,
        prs: &[PullRequest],
    ) -> crate::core::Result<usize> {
        GitHubClient::store_pull_requests(self, db, prs)
    }
}

#[cfg(test)]
#[path = "client_tests.rs"]
mod tests;