use std::collections::HashSet;
use std::path::PathBuf;
use std::time::Duration;
use serde::Deserialize;
use tracing::warn;
use crate::registry::package::PackageMetadata;
use crate::types::{Finding, FindingCategory, Severity};
/// Per-request timeout for GitHub API calls made by [`ProvenanceAnalyzer`].
const REQUEST_TIMEOUT: Duration = Duration::from_secs(15);
/// Response shape for GitHub's "get a tree" API
/// (`GET /repos/{owner}/{repo}/git/trees/{ref}?recursive=1`).
#[derive(Debug, Deserialize)]
struct GitHubTree {
// Flat list of entries; defaults to empty if the field is absent.
#[serde(default)]
tree: Vec<GitHubTreeEntry>,
// True when GitHub truncated the listing (repository has too many files);
// surfaced to the user as an Info finding in `compare_with_github`.
#[serde(default)]
truncated: bool,
}
/// One entry of a GitHub tree response.
#[derive(Debug, Deserialize)]
struct GitHubTreeEntry {
// Path relative to the repository root.
path: String,
// Git object type; this module only consumes "blob" entries (files).
#[serde(rename = "type")]
entry_type: String,
}
/// An `owner`/`repo` pair parsed from package metadata. Components are
/// validated by `is_safe_github_component` before construction, so they are
/// safe to interpolate into an api.github.com URL.
#[derive(Debug, Clone)]
struct GitHubRepo {
owner: String,
repo: String,
}
/// Returns true when `s` is safe to use as a GitHub owner or repository
/// name in a URL: non-empty, no traversal (`..`), no scheme (`://`), no
/// userinfo (`@`), and only ASCII alphanumerics plus `-`, `_`, `.`.
fn is_safe_github_component(s: &str) -> bool {
    !s.is_empty()
        && !s.contains("..")
        && !s.contains("://")
        && !s.contains('@')
        && s.bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.'))
}
/// Returns true when `s` is safe to use as a git ref in a GitHub API URL.
///
/// Same policy as `is_safe_github_component` except that `/` is permitted
/// (refs such as "heads/main"). Whitespace and control characters are
/// excluded automatically because they are not in the final allowlist.
fn is_safe_git_ref(s: &str) -> bool {
    if s.is_empty() || s.contains("..") || s.contains("://") || s.contains('@') {
        return false;
    }
    s.chars()
        .all(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.' | '/'))
}
/// Extract a repository URL from either form npm allows for the
/// `repository` field — a plain string, or an object with a `url` key —
/// and delegate the actual parsing to `parse_github_url`.
fn parse_github_repo(repository: &serde_json::Value) -> Option<GitHubRepo> {
    let url_str: &str = match repository {
        serde_json::Value::String(s) => s.as_str(),
        serde_json::Value::Object(obj) => obj.get("url").and_then(|v| v.as_str())?,
        _ => return None,
    };
    parse_github_url(url_str)
}
/// Parse a GitHub owner/repo pair out of a repository URL string.
///
/// Accepts `github:owner/repo` shorthand, `git@github.com:` SSH URLs,
/// http(s)/git/ssh URLs whose host is github.com, and bare `owner/repo`
/// shorthand. Returns `None` for non-GitHub URLs or unsafe components.
fn parse_github_url(raw: &str) -> Option<GitHubRepo> {
    let s = raw.trim();
    if let Some(rest) = s.strip_prefix("github:") {
        return parse_owner_repo(rest);
    }
    if let Some(rest) = s.strip_prefix("git@github.com:") {
        return parse_owner_repo(rest);
    }
    // Normalize the common npm URL schemes down to https so a single
    // "github.com/" search below handles all of them.
    let normalized = s
        .replace("git+https://", "https://")
        .replace("git+ssh://", "ssh://")
        .replace("git://", "https://")
        .replace("ssh://git@", "https://");
    if let Some(idx) = normalized.find("github.com/") {
        // Bug fix: require a host boundary before the match. A raw
        // substring search also matched hosts like "evilgithub.com/…",
        // misattributing a non-GitHub host to GitHub. Accept only the
        // start of the string or a preceding '/', '.', '@' or ':'
        // (scheme separator, subdomain dot, userinfo, or port/SSH colon).
        let at_boundary = idx == 0
            || matches!(normalized.as_bytes()[idx - 1], b'/' | b'.' | b'@' | b':');
        if at_boundary {
            let after = &normalized[idx + "github.com/".len()..];
            return parse_owner_repo(after);
        }
        // Fall through: a URL-ish string always contains '.', so the bare
        // shorthand branch below cannot match it; the result is None.
    }
    // Bare "owner/repo" shorthand (allowed by npm in `repository`).
    if !s.contains("://") && !s.contains(':') && !s.contains('.') {
        let parts: Vec<&str> = s.split('/').collect();
        if parts.len() == 2 && !parts[0].is_empty() && !parts[1].is_empty() {
            let owner = parts[0].to_string();
            let repo = parts[1].trim_end_matches(".git").to_string();
            if !is_safe_github_component(&owner) || !is_safe_github_component(&repo) {
                warn!(
                    "Rejected unsafe GitHub component: owner={:?}, repo={:?}",
                    owner, repo
                );
                return None;
            }
            return Some(GitHubRepo { owner, repo });
        }
    }
    None
}
/// Split an "owner/repo[/…]" remainder into validated GitHub components.
///
/// Trailing slashes and a ".git" suffix on the repo segment are stripped;
/// extra path segments after owner/repo are ignored. Returns `None` (with a
/// warning) when either component fails validation.
fn parse_owner_repo(s: &str) -> Option<GitHubRepo> {
    let trimmed = s.trim().trim_end_matches('/');
    let mut segments = trimmed.split('/');
    let owner = segments.next().filter(|seg| !seg.is_empty())?.to_string();
    let repo = segments
        .next()
        .filter(|seg| !seg.is_empty())?
        .trim_end_matches(".git")
        .trim_end_matches('/')
        .to_string();
    if is_safe_github_component(&owner) && is_safe_github_component(&repo) {
        Some(GitHubRepo { owner, repo })
    } else {
        warn!(
            "Rejected unsafe GitHub component: owner={:?}, repo={:?}",
            owner, repo
        );
        None
    }
}
/// File extensions treated as JavaScript/TypeScript sources for the
/// tarball-vs-repository comparison.
const JS_EXTENSIONS: &[&str] = &["js", "mjs", "cjs", "ts"];

/// Returns true when `path` names a JS/TS source file.
///
/// Fixed to use `Path::extension`, which requires an actual `.ext` suffix.
/// The previous `rsplit('.').next()` approach always produced `Some` (its
/// `else` branch was unreachable) and misclassified a bare extensionless
/// filename such as "js" or "ts" as a JavaScript file.
fn is_js_file(path: &str) -> bool {
    std::path::Path::new(path)
        .extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| JS_EXTENSIONS.contains(&ext))
}
/// Checks package provenance: presence of npm attestations, plus a
/// comparison of the published tarball's JS files against the declared
/// GitHub repository tree.
pub struct ProvenanceAnalyzer {
// Reused HTTP client, preconfigured in `new` with a timeout, a
// versioned user-agent, and (optionally) a GitHub auth header.
client: reqwest::Client,
}
impl Default for ProvenanceAnalyzer {
fn default() -> Self {
Self::new()
}
}
impl ProvenanceAnalyzer {
/// Build an analyzer with a preconfigured HTTP client.
///
/// If the `GITHUB_TOKEN` environment variable is set and forms a valid
/// header value, it is sent as a Bearer token on every request (raises
/// GitHub API rate limits). If the client builder fails, this falls back
/// to `reqwest::Client::default()`.
pub fn new() -> Self {
    let mut headers = reqwest::header::HeaderMap::new();
    let auth_header = std::env::var("GITHUB_TOKEN").ok().and_then(|token| {
        reqwest::header::HeaderValue::from_str(&format!("Bearer {token}")).ok()
    });
    if let Some(value) = auth_header {
        headers.insert(reqwest::header::AUTHORIZATION, value);
    }
    let client = reqwest::Client::builder()
        .timeout(REQUEST_TIMEOUT)
        .user_agent(concat!("aegis-cli/", env!("CARGO_PKG_VERSION")))
        .default_headers(headers)
        .build()
        .unwrap_or_default();
    Self { client }
}
/// Convenience wrapper: runs [`Self::analyze`] with the fields of an
/// `AnalysisContext`.
pub async fn analyze_ctx(&self, ctx: &crate::types::AnalysisContext<'_>) -> Vec<Finding> {
self.analyze(ctx.files, ctx.package_json, ctx.metadata, ctx.version)
.await
}
/// Run the full provenance analysis for one package version.
///
/// Emits findings for: presence/absence of an npm provenance attestation;
/// a missing repository URL when there is also no attestation (High); an
/// inaccessible declared repository (Medium); and tarball/repository file
/// discrepancies from `compare_with_github`.
pub async fn analyze(
    &self,
    files: &[(PathBuf, String)],
    package_json: &serde_json::Value,
    metadata: &PackageMetadata,
    version: &str,
) -> Vec<Finding> {
    let mut findings = Vec::new();
    // Prefer the registry-reported name; fall back to package.json.
    let pkg_name = metadata
        .name
        .as_deref()
        .or_else(|| package_json.get("name").and_then(|v| v.as_str()))
        .unwrap_or("<unknown>");
    let has_attestation = self.check_attestation(metadata, version, &mut findings, pkg_name);
    // Idiom fix: `let … else` replaces the old `is_none()` check followed
    // by an `unwrap()` on the same Option.
    let Some(github_repo) = package_json.get("repository").and_then(parse_github_repo) else {
        // With neither an attestation nor a repository URL there is no
        // provenance signal at all — that combination is High severity.
        if !has_attestation {
            findings.push(Finding {
                severity: Severity::High,
                category: FindingCategory::Provenance,
                title: "No provenance attestation and no repository URL".into(),
                description: format!(
                    "Package `{pkg_name}@{version}` has no npm provenance attestation \
                     and no repository URL in package.json. There is no way to verify \
                     the source of the published code."
                ),
                file: None,
                line: None,
                snippet: None,
            });
        }
        return findings;
    };
    match self
        .compare_with_github(files, &github_repo, version, pkg_name)
        .await
    {
        Ok(comparison_findings) => findings.extend(comparison_findings),
        Err(CompareError::RepoNotAccessible) => {
            findings.push(Finding {
                severity: Severity::Medium,
                category: FindingCategory::Provenance,
                title: "Repository URL not accessible".into(),
                description: format!(
                    "Package `{pkg_name}@{version}` declares repository \
                     https://github.com/{}/{} but it returned a 404. \
                     The source code cannot be verified.",
                    github_repo.owner, github_repo.repo,
                ),
                file: None,
                line: None,
                snippet: None,
            });
        }
        Err(CompareError::ApiError(msg)) => {
            // API-level failures (rate limits, 5xx, parse errors) say
            // nothing about the package itself — log rather than report.
            warn!("GitHub API error during provenance check for {pkg_name}@{version}: {msg}");
        }
    }
    findings
}
/// Record whether this version carries an npm provenance attestation and
/// push a corresponding Info (present) or Low (absent) finding.
///
/// An attestation is detected when the registry version info has
/// `dist.attestations`, a top-level `attestations` key, or an
/// `_npmSignature` key in its extra metadata. Returns the detection result
/// so the caller can escalate when the repository URL is also missing.
fn check_attestation(
    &self,
    metadata: &PackageMetadata,
    version: &str,
    findings: &mut Vec<Finding>,
    pkg_name: &str,
) -> bool {
    let has_attestation = metadata.versions.get(version).map_or(false, |vi| {
        let in_dist = vi
            .extra
            .get("dist")
            .and_then(|d| d.get("attestations"))
            .is_some();
        in_dist
            || vi.extra.contains_key("attestations")
            || vi.extra.contains_key("_npmSignature")
    });
    // Exactly one finding is pushed either way; only its content differs.
    let (severity, title, description) = if has_attestation {
        (
            Severity::Info,
            "npm provenance attestation present",
            format!(
                "Package `{pkg_name}@{version}` has npm provenance attestation \
                 (Sigstore-based). This links the published package to its source \
                 repository and build process."
            ),
        )
    } else {
        (
            Severity::Low,
            "No npm provenance attestation",
            format!(
                "Package `{pkg_name}@{version}` does not have npm provenance attestation. \
                 Provenance links a published package to its source repo and build, \
                 making supply-chain attacks harder."
            ),
        )
    };
    findings.push(Finding {
        severity,
        category: FindingCategory::Provenance,
        title: title.into(),
        description,
        file: None,
        line: None,
        snippet: None,
    });
    has_attestation
}
/// Compare the JS files shipped in the npm tarball against the file tree
/// of the declared GitHub repository, ideally at the matching release tag.
///
/// Returns a Critical finding listing tarball-only JS files that are not
/// recognizable build artifacts, plus an Info finding when GitHub
/// truncated the tree listing. `Err(RepoNotAccessible)` means no candidate
/// ref (tags or HEAD) could be fetched; other failures are `Err(ApiError)`.
///
/// NOTE(review): this assumes the paths in `files` are comparable to
/// repo-relative paths from the GitHub tree (i.e. any "package/" tarball
/// prefix was already stripped by the caller) — TODO confirm.
async fn compare_with_github(
&self,
files: &[(PathBuf, String)],
repo: &GitHubRepo,
version: &str,
pkg_name: &str,
) -> Result<Vec<Finding>, CompareError> {
// Only JS-like files are compared; skip the network entirely if the
// tarball ships none.
let tarball_js_files: HashSet<String> = files
.iter()
.map(|(p, _)| p.to_string_lossy().to_string())
.filter(|p| is_js_file(p))
.collect();
if tarball_js_files.is_empty() {
return Ok(Vec::new());
}
// Common release-tag conventions, tried in order: v1.2.3, 1.2.3, name@1.2.3.
let tag_candidates = vec![
format!("v{version}"),
version.to_string(),
format!("{pkg_name}@{version}"),
];
let mut github_tree = None;
for tag in &tag_candidates {
match self.fetch_github_tree(repo, tag).await {
Ok(tree) => {
github_tree = Some(tree);
break;
}
// 404 for this tag: try the next candidate.
Err(CompareError::RepoNotAccessible) => {
continue;
}
Err(e) => return Err(e),
}
}
// No tag matched: fall back to the default branch (HEAD). A 404 here
// means the repository itself is unreachable.
if github_tree.is_none() {
match self.fetch_github_tree(repo, "HEAD").await {
Ok(tree) => github_tree = Some(tree),
Err(CompareError::RepoNotAccessible) => {
return Err(CompareError::RepoNotAccessible);
}
Err(e) => return Err(e),
}
}
let github_tree = match github_tree {
Some(t) => t,
None => return Err(CompareError::RepoNotAccessible),
};
// Repo-side JS files. Only "blob" entries are files ("tree" = directory).
let github_js_files: HashSet<String> = github_tree
.tree
.iter()
.filter(|e| e.entry_type == "blob")
.map(|e| e.path.clone())
.filter(|p| is_js_file(p))
.collect();
let mut findings = Vec::new();
// Files present in the tarball but absent from the repository tree.
// Sorted so the report (and the `file` field below) is deterministic.
let mut npm_only: Vec<&String> = tarball_js_files
.iter()
.filter(|f| !github_js_files.contains(f.as_str()))
.collect();
npm_only.sort();
// Ignore conventional build output (dist/, *.min.js, ...) that is
// expected to exist only in the published package.
let npm_only_suspicious: Vec<&&String> = npm_only
.iter()
.filter(|f| !is_expected_build_artifact(f))
.collect();
if !npm_only_suspicious.is_empty() {
// Cap the listing at 20 paths to keep the finding readable.
let file_list: Vec<String> = npm_only_suspicious
.iter()
.take(20)
.map(|f| format!(" - {f}"))
.collect();
let truncation_note = if npm_only_suspicious.len() > 20 {
format!("\n ... and {} more", npm_only_suspicious.len() - 20)
} else {
String::new()
};
findings.push(Finding {
severity: Severity::Critical,
category: FindingCategory::Provenance,
title: format!(
"{} JS file(s) in npm tarball not found in GitHub source",
npm_only_suspicious.len()
),
description: format!(
"Package `{pkg_name}` has JavaScript files in the npm tarball that \
do not exist in the GitHub repository ({}/{}). This could indicate \
injected malicious code:\n{}{truncation_note}",
repo.owner,
repo.repo,
file_list.join("\n"),
),
file: npm_only_suspicious.first().map(|f| f.to_string()),
line: None,
snippet: None,
});
}
// A truncated tree means repo files may be missing from
// `github_js_files`, so the npm-only list above can contain false
// positives; tell the user the comparison may be incomplete.
if github_tree.truncated {
findings.push(Finding {
severity: Severity::Info,
category: FindingCategory::Provenance,
title: "GitHub tree response was truncated".into(),
description: format!(
"The GitHub tree API response for {}/{} was truncated (repository \
has too many files). The file comparison may be incomplete.",
repo.owner, repo.repo,
),
file: None,
line: None,
snippet: None,
});
}
Ok(findings)
}
/// Fetch the recursive tree of `repo` at `git_ref` from the GitHub API.
///
/// Both path components and the ref are re-validated here (defense in
/// depth) so no unvetted string is ever interpolated into the request URL.
/// A 404 becomes `RepoNotAccessible`; every other failure — transport
/// error, non-2xx status, or a body that fails to deserialize — becomes
/// `ApiError` with a description.
async fn fetch_github_tree(
    &self,
    repo: &GitHubRepo,
    git_ref: &str,
) -> Result<GitHubTree, CompareError> {
    if !(is_safe_github_component(&repo.owner) && is_safe_github_component(&repo.repo)) {
        return Err(CompareError::ApiError(format!(
            "Unsafe GitHub owner/repo: {}/{}",
            repo.owner, repo.repo,
        )));
    }
    if !is_safe_git_ref(git_ref) {
        return Err(CompareError::ApiError(format!("Unsafe git ref: {git_ref:?}")));
    }
    let url = format!(
        "https://api.github.com/repos/{}/{}/git/trees/{git_ref}?recursive=1",
        repo.owner, repo.repo,
    );
    let response = self
        .client
        .get(&url)
        .send()
        .await
        .map_err(|e| CompareError::ApiError(format!("HTTP request failed: {e}")))?;
    let status = response.status();
    match status {
        reqwest::StatusCode::NOT_FOUND => Err(CompareError::RepoNotAccessible),
        s if !s.is_success() => Err(CompareError::ApiError(format!(
            "GitHub API returned {status}"
        ))),
        _ => response
            .json::<GitHubTree>()
            .await
            .map_err(|e| CompareError::ApiError(format!("Failed to parse GitHub tree: {e}"))),
    }
}
}
/// Internal error type for the GitHub comparison step.
enum CompareError {
// The repository (or requested ref) returned 404; `analyze` reports
// this to the user as a Medium-severity finding.
RepoNotAccessible,
// Any other HTTP/parse failure; `analyze` logs it without a finding.
ApiError(String),
}
/// Heuristic for paths that legitimately exist only in the published
/// tarball: compiled/bundled output under a conventional build directory,
/// or minified/bundled single files.
fn is_expected_build_artifact(path: &str) -> bool {
    const BUILD_PREFIXES: &[&str] = &[
        "dist/", "build/", "lib/", "out/", "cjs/", "esm/", "umd/", "es/", "module/", "_cjs/",
        "_esm/",
    ];
    // Compare case-insensitively: npm packages mix cases freely.
    let lower = path.to_lowercase();
    BUILD_PREFIXES.iter().any(|prefix| lower.starts_with(prefix))
        || lower.ends_with(".min.js")
        || lower.ends_with(".bundle.js")
}
// Unit tests for the pure helpers in this module: repository URL parsing,
// component/ref validation, and file classification. The network-dependent
// paths (GitHub API fetch/compare) are intentionally not covered here.
#[cfg(test)]
mod tests {
use super::*;
// --- repository URL parsing: accepted forms ---
#[test]
fn parse_https_url() {
let v = serde_json::json!("https://github.com/axios/axios");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "axios");
assert_eq!(repo.repo, "axios");
}
#[test]
fn parse_https_url_with_git_suffix() {
let v = serde_json::json!("https://github.com/lodash/lodash.git");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "lodash");
assert_eq!(repo.repo, "lodash");
}
#[test]
fn parse_git_plus_https() {
let v = serde_json::json!("git+https://github.com/user/repo.git");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "user");
assert_eq!(repo.repo, "repo");
}
#[test]
fn parse_object_format() {
let v = serde_json::json!({"type": "git", "url": "https://github.com/facebook/react.git"});
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "facebook");
assert_eq!(repo.repo, "react");
}
#[test]
fn parse_github_shorthand() {
let v = serde_json::json!("github:user/repo");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "user");
assert_eq!(repo.repo, "repo");
}
#[test]
fn parse_bare_shorthand() {
let v = serde_json::json!("user/repo");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "user");
assert_eq!(repo.repo, "repo");
}
#[test]
fn parse_ssh_url() {
let v = serde_json::json!("git@github.com:user/repo.git");
let repo = parse_github_repo(&v).unwrap();
assert_eq!(repo.owner, "user");
assert_eq!(repo.repo, "repo");
}
// --- repository URL parsing: rejected forms ---
#[test]
fn parse_non_github_returns_none() {
let v = serde_json::json!("https://gitlab.com/user/repo");
assert!(parse_github_repo(&v).is_none());
}
// --- file classification ---
#[test]
fn is_js_file_positive() {
assert!(is_js_file("index.js"));
assert!(is_js_file("src/main.mjs"));
assert!(is_js_file("lib/util.cjs"));
assert!(is_js_file("types.ts"));
}
#[test]
fn is_js_file_negative() {
assert!(!is_js_file("readme.md"));
assert!(!is_js_file("package.json"));
assert!(!is_js_file("logo.png"));
}
#[test]
fn build_artifact_detection() {
assert!(is_expected_build_artifact("dist/index.js"));
assert!(is_expected_build_artifact("build/main.js"));
assert!(is_expected_build_artifact("lib/utils.js"));
assert!(is_expected_build_artifact("jquery.min.js"));
assert!(is_expected_build_artifact("app.bundle.js"));
assert!(!is_expected_build_artifact("src/index.js"));
assert!(!is_expected_build_artifact("index.js"));
}
// --- component / ref validation ---
#[test]
fn safe_github_component_accepts_normal_names() {
assert!(is_safe_github_component("axios"));
assert!(is_safe_github_component("facebook"));
assert!(is_safe_github_component("my-repo"));
assert!(is_safe_github_component("my_repo"));
assert!(is_safe_github_component("lodash.js"));
assert!(is_safe_github_component("repo123"));
}
#[test]
fn safe_github_component_rejects_malicious_input() {
assert!(!is_safe_github_component("../../../etc/passwd"));
assert!(!is_safe_github_component(".."));
assert!(!is_safe_github_component("foo..bar"));
assert!(!is_safe_github_component("http://evil.com"));
assert!(!is_safe_github_component("https://evil.com"));
assert!(!is_safe_github_component("git@evil.com"));
assert!(!is_safe_github_component("foo bar"));
assert!(!is_safe_github_component("foo\tbar"));
assert!(!is_safe_github_component("foo\nbar"));
assert!(!is_safe_github_component("foo\0bar"));
assert!(!is_safe_github_component(""));
assert!(!is_safe_github_component("evil/path"));
}
#[test]
fn safe_git_ref_accepts_normal_refs() {
assert!(is_safe_git_ref("HEAD"));
assert!(is_safe_git_ref("v1.0.0"));
assert!(is_safe_git_ref("1.0.0"));
assert!(is_safe_git_ref("main"));
assert!(is_safe_git_ref("heads/main"));
assert!(is_safe_git_ref("my-package_v1.2.3"));
}
#[test]
fn safe_git_ref_rejects_malicious_input() {
assert!(!is_safe_git_ref("../../../etc/passwd"));
assert!(!is_safe_git_ref("http://evil.com"));
assert!(!is_safe_git_ref("git@evil.com:foo"));
assert!(!is_safe_git_ref("ref with spaces"));
assert!(!is_safe_git_ref("ref\nnewline"));
assert!(!is_safe_git_ref(""));
}
// --- SSRF / injection resistance of the full parse path ---
#[test]
fn parse_rejects_ssrf_payloads_in_owner() {
let v = serde_json::json!("https://github.com/../../../api/v1");
assert!(parse_github_repo(&v).is_none());
}
#[test]
fn parse_rejects_ssrf_payloads_in_repo() {
let v = serde_json::json!("https://github.com/legit/repo@evil.com");
assert!(parse_github_repo(&v).is_none());
}
#[test]
fn parse_rejects_encoded_traversal() {
let v = serde_json::json!("https://github.com/evil..com/repo");
assert!(parse_github_repo(&v).is_none());
}
#[test]
fn parse_rejects_whitespace_in_components() {
let v = serde_json::json!("https://github.com/evil user/repo");
assert!(parse_github_repo(&v).is_none());
}
// --- table-driven acceptance check across all supported URL styles ---
#[test]
fn parse_accepts_normal_repos_after_validation() {
let cases = vec![
("https://github.com/axios/axios", "axios", "axios"),
("https://github.com/facebook/react.git", "facebook", "react"),
(
"git+https://github.com/lodash/lodash.git",
"lodash",
"lodash",
),
("github:chalk/chalk", "chalk", "chalk"),
(
"git@github.com:expressjs/express.git",
"expressjs",
"express",
),
];
for (url, expected_owner, expected_repo) in cases {
let v = serde_json::json!(url);
let repo = parse_github_repo(&v).unwrap_or_else(|| panic!("Expected to parse {url}"));
assert_eq!(repo.owner, expected_owner, "owner mismatch for {url}");
assert_eq!(repo.repo, expected_repo, "repo mismatch for {url}");
}
}
}