repo-trust 0.1.1

A command-line tool that tells you whether an open-source repository deserves your trust — beyond the star count.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
//! `scan` — evaluate a single repository.
//!
//! All five modules (stars, activity, maintainers, adoption, security) are
//! wired end-to-end; `--modules` / `--skip-modules` select a subset.

use std::time::Instant;

use anyhow::{Context, Result};
use clap::Args;
use time::OffsetDateTime;

use crate::api::deps_dev::Client as DepsDevClient;
use crate::api::github::Client as GhClient;
use crate::api::github::GithubError;
use crate::api::osv::Client as OsvClient;
use crate::api::scorecard::Client as ScorecardClient;
use crate::config;
use crate::models::{
    Category, ModuleResult, ModuleWeights, RepositoryContext, RepositorySummary, TrustReport,
};
use crate::reports::json_report;
use crate::scoring::{aggregate, overall_confidence};
use crate::storage::Cache;
use crate::utils::{ratelimit::RateLimiter, repo_url};

// Arguments for `scan`, parsed by clap via `#[derive(Args)]` and consumed by
// `execute` below.
//
// NOTE: clap's derive macro turns the `///` comments on these fields into the
// user-visible `--help` text, so they are effectively runtime strings.
// Reviewer notes in this struct therefore use plain `//` comments.
#[derive(Debug, Args)]
pub struct ScanArgs {
    /// Repository identifier: `owner/repo` or full GitHub URL.
    // Validated and normalised by `repo_url::parse` at the start of `execute`.
    pub repo: String,

    /// Execution mode.
    #[arg(long, value_enum, default_value_t = Mode::Standard)]
    pub mode: Mode,

    /// Comma-separated list of modules to enable (default: all).
    // Resolved together with `skip_modules` by `select_modules`.
    #[arg(long, value_delimiter = ',')]
    pub modules: Option<Vec<String>>,

    /// Comma-separated list of modules to skip.
    #[arg(long, value_delimiter = ',')]
    pub skip_modules: Option<Vec<String>>,

    /// Output directory for written report files.
    // Created with `create_dir_all` in `execute` before any report is written.
    #[arg(long, default_value = "./repo-trust-reports")]
    pub output: std::path::PathBuf,

    /// Output formats to write (terminal is always shown unless --quiet).
    // An empty list makes `resolve_formats` fall back to the configured
    // `[output] default_formats`.
    #[arg(long, value_delimiter = ',', value_enum)]
    pub format: Vec<Format>,

    /// Path to a TOML file with custom module weights.
    // When unset, `execute` uses the weights from the loaded config instead.
    #[arg(long)]
    pub weights: Option<std::path::PathBuf>,

    /// Pin a specific scoring version. If unset, latest is used.
    // Parsed as SemVer in `execute`; the fallback is `crate::SCORING_VERSION`.
    #[arg(long)]
    pub scoring_version: Option<String>,

    /// GitHub Personal Access Token. Prefer the `GITHUB_TOKEN` env var.
    // If neither the flag nor the env var is set, `execute` falls back to
    // `cfg.github.resolve_token()` and warns when that is also `None`.
    #[arg(long, env = "GITHUB_TOKEN", hide_env_values = true)]
    pub token: Option<String>,

    /// RNG seed for sampling (deterministic output). Default derived from repo+scoring_version.
    #[arg(long)]
    pub seed: Option<u64>,

    /// Invalidate all cache entries for this repo before scanning.
    #[arg(long)]
    pub refresh: bool,

    /// Invalidate cache for a specific module before scanning.
    // NOTE(review): this field is not read anywhere in the visible part of
    // this file — confirm whether per-module invalidation is implemented.
    #[arg(long)]
    pub refresh_module: Option<String>,

    /// Verbose tracing logs (sets RUST_LOG=debug).
    // NOTE(review): not read in the visible part of this file; presumably
    // handled by the caller before `execute` runs — verify.
    #[arg(long)]
    pub debug: bool,

    /// Suppress progress output.
    #[arg(long)]
    pub quiet: bool,

    /// Disable terminal colors.
    #[arg(long)]
    pub no_color: bool,

    /// Shorthand for `--format json --quiet`.
    // `resolve_formats` maps this flag to exactly `[Format::Json]`.
    #[arg(long)]
    pub json: bool,

    /// Override the GitHub API base URL. Hidden — used by integration tests
    /// to point at a wiremock server.
    // `execute` applies the same base URL to the scorecard, OSV and deps.dev
    // clients as well, not only to the GitHub client.
    #[arg(long, hide = true, env = "REPO_TRUST_API_BASE_URL")]
    pub api_base_url: Option<String>,

    /// Pin the scan's `snapshot_at` to a fixed RFC 3339 / ISO 8601 instant.
    /// Hidden — used by snapshot tests so that evidence values derived from
    /// `now - commit_date` (e.g. `days_since_last_commit`) stay stable
    /// across CI runs on different days. Production scans always use the
    /// wall clock.
    // Parsed in `execute` with the `time` crate's `Iso8601::DEFAULT` format.
    #[arg(long, hide = true, env = "REPO_TRUST_SNAPSHOT_AT")]
    pub snapshot_at: Option<String>,
}

// CLI-facing execution mode selected with `--mode`. `execute` converts it to
// `crate::models::Mode` when building the report and uses `Mode::as_str` as
// part of the report cache key.
//
// NOTE: the `///` comments on the variants are picked up by clap's
// `ValueEnum` derive as help text; the time / API-call budgets they mention
// are not enforced anywhere in this file.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum Mode {
    /// < 5s, < 30 API calls, headline signals only.
    Quick,
    /// < 30s, < 200 API calls, all modules at default sampling.
    Standard,
    /// < 5min, < 2000 API calls, larger sampling and graph analysis.
    Deep,
}

impl Mode {
    /// Lower-case name of this mode: `"quick"`, `"standard"` or `"deep"`.
    ///
    /// Used as the mode component of the report cache key and in the
    /// end-of-scan summary line.
    fn as_str(self) -> &'static str {
        match self {
            Self::Deep => "deep",
            Self::Standard => "standard",
            Self::Quick => "quick",
        }
    }
}

// Report formats selectable with `--format` (or via the config's
// `[output] default_formats`, see `resolve_formats`).
//
// How `execute` handles each variant:
// - `Json`, `Md`, `Csv`: written to `<output>/<owner>_<repo>.<ext>`.
// - `Terminal`: rendered to stdout unless `--quiet` is set.
// - `Sarif`: accepted, but only logs a warning and is skipped.
//
// Plain `//` comments are used here on purpose: `///` comments on the
// variants would be picked up by clap's `ValueEnum` derive as help text.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum Format {
    Terminal,
    Json,
    Md,
    Csv,
    Sarif,
}

pub async fn execute(args: ScanArgs) -> Result<u8> {
    let started = Instant::now();
    tracing::info!(repo = %args.repo, mode = ?args.mode, "scan starting");

    // ─── Repo URL ──────────────────────────────────────────────────────
    let full_name = repo_url::parse(&args.repo).context("invalid repo identifier")?;
    let canonical_url = url::Url::parse(&format!("https://github.com/{full_name}"))?;

    // ─── Config ────────────────────────────────────────────────────────
    let cfg = config::load::<()>(None).context("loading config")?;
    let token = args.token.clone().or_else(|| cfg.github.resolve_token());
    if token.is_none() {
        tracing::warn!("no GitHub token configured; running unauthenticated (60 req/h limit)");
    }
    let weights = if let Some(p) = &args.weights {
        crate::scoring::weights::load(p).context("loading custom weights")?
    } else {
        ModuleWeights::from(cfg.weights)
    };

    // ─── Cache ─────────────────────────────────────────────────────────
    let cache_path = cfg.cache.resolved_path();
    let cache = Cache::open(&cache_path).context("opening cache")?;
    if args.refresh {
        let n = cache.delete_by_repo(&full_name)?;
        tracing::info!(invalidated = n, "cache invalidated for repo");
    }

    // ─── HTTP / federated clients ────────────────────────────────────
    let http = crate::api::client::build()?;
    let limiter = RateLimiter::default();
    let mut github = GhClient::new(http.clone(), cache.clone(), limiter, token);
    let mut scorecard = ScorecardClient::new(http.clone(), cache.clone());
    let mut osv = OsvClient::new(http.clone(), cache.clone());
    let mut deps_dev = DepsDevClient::new(http.clone(), cache.clone());
    if let Some(base) = args.api_base_url.as_deref() {
        github = github.with_base_url(base);
        // The same wiremock server hosts the federated mocks in tests; in
        // production the federated clients hit their real endpoints.
        scorecard = scorecard.with_base_url(base);
        osv = osv.with_base_url(base);
        deps_dev = deps_dev.with_base_url(base);
    }

    // ─── Scoring version + seed ────────────────────────────────────────
    let scoring_version = match &args.scoring_version {
        Some(s) => semver::Version::parse(s).context("invalid scoring version")?,
        None => semver::Version::parse(crate::SCORING_VERSION)
            .expect("crate SCORING_VERSION is valid SemVer"),
    };
    let rng_seed = args.seed.unwrap_or_else(|| {
        crate::utils::sampling::derive_seed(&full_name, &scoring_version.to_string())
    });
    let snapshot_at = match &args.snapshot_at {
        Some(s) => {
            time::OffsetDateTime::parse(s, &time::format_description::well_known::Iso8601::DEFAULT)
                .context("--snapshot-at must be ISO 8601 (e.g. 2026-05-03T12:00:00Z)")?
        },
        None => OffsetDateTime::now_utc(),
    };

    // ─── Build context ────────────────────────────────────────────────
    let ctx = RepositoryContext {
        full_name: full_name.clone(),
        canonical_url,
        mode: args.mode,
        scoring_version: scoring_version.clone(),
        weights,
        rng_seed,
        snapshot_at,
        cache,
        github: github.clone(),
        scorecard,
        osv,
        deps_dev,
    };

    // ─── Run modules ──────────────────────────────────────────────────
    // Day 3: all 5 modules wired end-to-end. Default set in select_modules
    // covers everything; users can subset via --modules / --skip-modules.
    let selected = select_modules(args.modules.as_ref(), args.skip_modules.as_ref());
    let mut module_results: Vec<ModuleResult> = Vec::new();
    let mut all_evidence = Vec::new();

    use crate::modules::TrustModule;
    for name in &selected {
        let result = match name.as_str() {
            "activity" => {
                let m = crate::modules::activity::ActivityModule;
                Some(m.run(&ctx).await)
            },
            "maintainers" => {
                let m = crate::modules::maintainers::MaintainersModule;
                Some(m.run(&ctx).await)
            },
            "security" => {
                let m = crate::modules::security::SecurityModule;
                Some(m.run(&ctx).await)
            },
            "stars" => {
                let m = crate::modules::stars::StarsModule;
                Some(m.run(&ctx).await)
            },
            "adoption" => {
                let m = crate::modules::adoption::AdoptionModule;
                Some(m.run(&ctx).await)
            },
            other => {
                tracing::debug!(module = other, "unknown module name; skipping");
                None
            },
        };
        if let Some(res) = result {
            let (r, ev) = res.with_context(|| format!("module '{name}' failed"))?;
            module_results.push(r);
            all_evidence.extend(ev);
        }
    }

    // ─── Aggregate ────────────────────────────────────────────────────
    let overall_score = aggregate(&module_results, &ctx.weights);
    let overall_conf = overall_confidence(&module_results, &ctx.weights);
    let category = Category::from_score(overall_score);
    let (top_strengths, top_concerns) =
        crate::scoring::explain::top_strengths_and_concerns(&all_evidence, 3);

    // ─── Repo summary (cheap re-fetch from cache; metadata was warmed by activity) ─
    let (owner, name) = full_name
        .split_once('/')
        .ok_or_else(|| anyhow::anyhow!("invalid full_name"))?;
    let summary = match github.get_repo(owner, name).await {
        Ok(r) => RepositorySummary {
            full_name: r.full_name,
            url: r.html_url,
            default_branch: r.default_branch,
            primary_language: r.language,
            stars: r.stargazers_count,
            snapshot_at,
        },
        Err(e) => {
            // Map typed error → exit code per architecture §8.
            return Err(map_github_error(&e).unwrap_or(e));
        },
    };

    // ─── Build report ─────────────────────────────────────────────────
    let runtime_seconds = started.elapsed().as_secs_f64();
    let mut evidence_sorted = all_evidence;
    evidence_sorted.sort_by(|a, b| {
        (a.module.as_str(), a.code.as_str()).cmp(&(b.module.as_str(), b.code.as_str()))
    });

    let report = TrustReport {
        schema_version: crate::REPORT_SCHEMA_VERSION.to_string(),
        repository: summary,
        overall_score,
        overall_confidence: overall_conf,
        category,
        mode: match args.mode {
            Mode::Quick => crate::models::Mode::Quick,
            Mode::Standard => crate::models::Mode::Standard,
            Mode::Deep => crate::models::Mode::Deep,
        },
        modules: module_results,
        evidence: evidence_sorted,
        top_strengths,
        top_concerns,
        caveats: Vec::new(),
        scoring_version: scoring_version.to_string(),
        weights_used: ctx.weights,
        snapshot_at,
        runtime_seconds: crate::utils::time::round6(runtime_seconds),
    };

    // ─── Write reports ────────────────────────────────────────────────
    std::fs::create_dir_all(&args.output)
        .with_context(|| format!("creating output dir {:?}", args.output))?;
    let safe = full_name.replace('/', "_");
    let formats = resolve_formats(&args, &cfg.output.default_formats);

    // Always cache the report regardless of which writers ran.
    let json_bytes = serde_json::to_vec(&report)?;
    ctx.cache.put_report(
        &full_name,
        args.mode.as_str(),
        &scoring_version.to_string(),
        &json_bytes,
    )?;

    let mut wrote: Vec<std::path::PathBuf> = Vec::new();
    for fmt in &formats {
        match fmt {
            Format::Json => {
                let p = args.output.join(format!("{safe}.json"));
                json_report::write(&report, &p)?;
                wrote.push(p);
            },
            Format::Md => {
                let p = args.output.join(format!("{safe}.md"));
                crate::reports::markdown_report::write(&report, &p)?;
                wrote.push(p);
            },
            Format::Csv => {
                let p = args.output.join(format!("{safe}.csv"));
                crate::reports::csv_report::write(&report, &p)?;
                wrote.push(p);
            },
            Format::Terminal => {
                // Render to stdout unless --quiet.
                if !args.quiet {
                    let stdout = std::io::stdout();
                    let mut handle = stdout.lock();
                    crate::reports::terminal::write(&report, &mut handle, !args.no_color)?;
                }
            },
            Format::Sarif => {
                tracing::warn!("SARIF output deferred to v1.1; skipping");
            },
        }
    }

    if !args.quiet {
        for p in &wrote {
            println!("wrote {}", p.display());
        }
        println!(
            "score {} / {}, confidence {:?}",
            report.overall_score,
            mode_label(args.mode),
            report.overall_confidence,
        );
    }

    Ok(0)
}

/// Work out which report formats this run should produce.
///
/// Precedence, highest first:
/// 1. `--json` → exactly `[Json]`.
/// 2. Explicit `--format` values → returned as given.
/// 3. `[output] default_formats` from config, parsed by name; names that are
///    not recognised are logged at debug level and dropped.
/// 4. If nothing was produced by the steps above → `[Json]`.
fn resolve_formats(args: &ScanArgs, default_formats: &[String]) -> Vec<Format> {
    if args.json {
        return vec![Format::Json];
    }
    if !args.format.is_empty() {
        return args.format.clone();
    }

    let from_config: Vec<Format> = default_formats
        .iter()
        .filter_map(|entry| match entry.as_str() {
            "terminal" => Some(Format::Terminal),
            "json" => Some(Format::Json),
            "md" | "markdown" => Some(Format::Md),
            "csv" => Some(Format::Csv),
            "sarif" => Some(Format::Sarif),
            other => {
                tracing::debug!(format = other, "unknown format in config; skipping");
                None
            },
        })
        .collect();

    if from_config.is_empty() {
        // Always produce something writable for tests.
        vec![Format::Json]
    } else {
        from_config
    }
}

fn mode_label(m: Mode) -> &'static str {
    m.as_str()
}

/// Resolve the ordered list of module names to run.
///
/// * `enabled` — the explicit `--modules` list. `None`, an empty list, or a
///   list that is blank after normalisation falls back to the default set of
///   all five modules.
/// * `skipped` — the `--skip-modules` list; matching names are removed.
///
/// Names from both lists are trimmed and ASCII-lowercased, so `" Stars"`
/// matches `"stars"`. Duplicates are dropped (first occurrence wins) so a
/// repeated name cannot run a module twice and duplicate its result and
/// evidence. Names that are not known modules are passed through for the
/// caller to handle.
fn select_modules(enabled: Option<&Vec<String>>, skipped: Option<&Vec<String>>) -> Vec<String> {
    // Day 3 default set: all 5 modules wired end-to-end.
    const DEFAULT_MODULES: [&str; 5] = [
        "stars",
        "activity",
        "maintainers",
        "adoption",
        "security",
    ];

    /// Canonical form of a user-supplied module name, or `None` if blank.
    fn normalize(raw: &str) -> Option<String> {
        let name = raw.trim().to_ascii_lowercase();
        if name.is_empty() {
            None
        } else {
            Some(name)
        }
    }

    let requested: Vec<String> = enabled
        .map(|list| list.iter().filter_map(|raw| normalize(raw)).collect())
        .unwrap_or_default();
    let candidates: Vec<String> = if requested.is_empty() {
        DEFAULT_MODULES.iter().map(|name| (*name).to_string()).collect()
    } else {
        requested
    };

    let skip: Vec<String> = skipped
        .map(|list| list.iter().filter_map(|raw| normalize(raw)).collect())
        .unwrap_or_default();

    // Both lists hold at most a handful of names, so linear `contains`
    // checks are cheaper and simpler than building hash sets.
    let mut selected: Vec<String> = Vec::with_capacity(candidates.len());
    for name in candidates {
        if !skip.contains(&name) && !selected.contains(&name) {
            selected.push(name);
        }
    }
    selected
}

fn map_github_error(e: &anyhow::Error) -> Option<anyhow::Error> {
    // Surface as the original anyhow::Error — exit-code mapping happens in
    // cli::run via downcast in a future refactor. For Day 1 we just propagate.
    let _ = e;
    None
}

/// Map a `GithubError` to the architecture-§8 exit code.
#[must_use]
pub fn exit_code_for(error: &anyhow::Error) -> u8 {
    match error.downcast_ref::<GithubError>() {
        Some(GithubError::NotFound) => 2,
        Some(GithubError::Unauthorized) => 3,
        Some(GithubError::Forbidden(_)) => 4,
        _ => 1,
    }
}