use crate::detectors::{ContextAnalyzer, PatternDetector};
use crate::models::{Finding, Location};
use crate::scan_warn;
use anyhow::{Context, Result};
use git2::{Commit, Diff, DiffOptions, Oid, Repository};
use rayon::prelude::*;
use std::path::PathBuf;
/// Converts a commit's time (seconds since the Unix epoch) into a UTC `DateTime`.
///
/// When chrono cannot represent the value, a warning naming the commit is
/// emitted and `DateTime::default()` is returned instead of failing.
fn commit_timestamp(commit: &Commit) -> chrono::DateTime<chrono::Utc> {
    let seconds = commit.time().seconds();
    match chrono::DateTime::from_timestamp(seconds, 0) {
        Some(timestamp) => timestamp,
        None => {
            scan_warn!(
                "git",
                "invalid timestamp {} in commit {}",
                seconds,
                commit.id()
            );
            chrono::DateTime::default()
        }
    }
}
/// Scans a git repository for secrets, in its working directory and
/// (optionally) in its commit history.
///
/// Configure it through the `with_*` builder methods, then call `scan`.
pub struct GitScanner {
/// Path handed to `Repository::open`.
repo_path: PathBuf,
/// Whether `scan` also walks the commit history after the working directory.
scan_history: bool,
/// Upper bound on the number of commits walked; `None` means no limit.
max_depth: Option<usize>,
/// Depth limit forwarded to the filesystem scanner for the working directory.
max_fs_depth: Option<usize>,
/// Threshold passed to the pattern detector and the filesystem scanner.
entropy_threshold: f64,
/// Revspec hidden from the history walk (commits reachable from it are skipped).
since_commit: Option<String>,
/// `(from, to)` revspecs; when set, `scan` examines only this range.
commit_range: Option<(String, String)>,
/// Extra patterns; when non-empty they are used to build the pattern detector.
custom_patterns: Vec<crate::config::settings::CustomPattern>,
/// Forwarded to the filesystem scanner's `with_include_deps`.
include_deps: bool,
}
impl GitScanner {
/// Creates a scanner for the repository at `repo_path` with default settings:
/// history scanning on, no commit or filesystem depth limit, an entropy
/// threshold of 3.5, no commit boundary or range, no custom patterns, and
/// `include_deps` off.
pub fn new(repo_path: PathBuf) -> Self {
    const DEFAULT_ENTROPY_THRESHOLD: f64 = 3.5;

    Self {
        repo_path,
        scan_history: true,
        include_deps: false,
        entropy_threshold: DEFAULT_ENTROPY_THRESHOLD,
        max_depth: None,
        max_fs_depth: None,
        since_commit: None,
        commit_range: None,
        custom_patterns: Vec::new(),
    }
}
pub fn with_include_deps(mut self, include: bool) -> Self {
self.include_deps = include;
self
}
pub fn with_custom_patterns(
mut self,
patterns: Vec<crate::config::settings::CustomPattern>,
) -> Self {
self.custom_patterns = patterns;
self
}
pub fn with_history(mut self, scan_history: bool) -> Self {
self.scan_history = scan_history;
self
}
pub fn with_max_depth(mut self, depth: usize) -> Self {
self.max_depth = Some(depth);
self
}
pub fn with_max_fs_depth(mut self, depth: usize) -> Self {
self.max_fs_depth = Some(depth);
self
}
pub fn with_entropy_threshold(mut self, threshold: f64) -> Self {
self.entropy_threshold = threshold;
self
}
pub fn with_since_commit(mut self, commit: String) -> Self {
self.since_commit = Some(commit);
self
}
pub fn with_commit_range(mut self, from: String, to: String) -> Self {
self.commit_range = Some((from, to));
self
}
/// Runs the configured scan and returns every finding.
///
/// * With a commit range configured, only that range is scanned.
/// * Otherwise the working directory is scanned first; if history scanning is
///   enabled, history findings are appended unless a finding with the same
///   file path and secret value has already been collected.
///
/// # Errors
/// Fails if the repository cannot be opened or any sub-scan returns an error.
pub fn scan(&self) -> Result<Vec<Finding>> {
    let repo = Repository::open(&self.repo_path).context("Failed to open git repository")?;

    // A configured range takes precedence over every other scan mode.
    if let Some((from, to)) = self.commit_range.as_ref() {
        return self.scan_commit_range(&repo, from, to);
    }

    let mut results = self.scan_working_directory(&repo)?;
    if !self.scan_history {
        return Ok(results);
    }

    for candidate in self.scan_git_history(&repo)? {
        let already_reported = results.iter().any(|known: &Finding| {
            known.location.file_path == candidate.location.file_path
                && known.secret.value == candidate.secret.value
        });
        if !already_reported {
            results.push(candidate);
        }
    }
    Ok(results)
}
fn scan_git_history(&self, repo: &Repository) -> Result<Vec<Finding>> {
let mut revwalk = repo.revwalk()?;
revwalk.push_head()?;
let max_commits = self.max_depth.unwrap_or(usize::MAX);
let since_oid = if let Some(ref since) = self.since_commit {
let obj = repo
.revparse_single(since)
.with_context(|| format!("Could not resolve commit: {}", since))?;
let oid = obj.id();
revwalk.hide(oid).with_context(|| {
format!("Could not set boundary commit: {}", since)
})?;
Some(oid)
} else {
None
};
let mut commit_oids: Vec<Oid> = Vec::new();
for (commit_count, oid) in revwalk.enumerate() {
if commit_count >= max_commits {
break;
}
let oid = oid?;
commit_oids.push(oid);
}
let repo_path = self.repo_path.clone();
let entropy_threshold = self.entropy_threshold;
let custom_patterns = self.custom_patterns.clone();
let mut findings: Vec<Finding> = commit_oids
.par_iter()
.flat_map(|oid| {
let repo = match Repository::open(&repo_path) {
Ok(r) => r,
Err(e) => {
scan_warn!(
"git",
"could not open repo for commit {}: {}",
oid,
e
);
return Vec::new();
}
};
let commit = match repo.find_commit(*oid) {
Ok(c) => c,
Err(e) => {
scan_warn!("git", "could not find commit {}: {}", oid, e);
return Vec::new();
}
};
match Self::scan_commit_static(
&repo,
&commit,
entropy_threshold,
&custom_patterns,
) {
Ok(f) => f,
Err(e) => {
scan_warn!(
"git",
"error scanning commit {}: {}",
oid,
e
);
Vec::new()
}
}
})
.collect();
if let Some(boundary) = since_oid {
if let Ok(head_ref) = repo.head() {
if let Some(head_oid) = head_ref.target() {
if let (Ok(from_commit), Ok(to_commit)) = (
repo.find_commit(boundary),
repo.find_commit(head_oid),
) {
if let (Ok(from_tree), Ok(to_tree)) =
(from_commit.tree(), to_commit.tree())
{
let mut diff_opts = DiffOptions::new();
if let Ok(cumulative_diff) = repo.diff_tree_to_tree(
Some(&from_tree),
Some(&to_tree),
Some(&mut diff_opts),
) {
let mut cumulative_findings = Vec::new();
let _ = Self::scan_diff_static(
repo,
&cumulative_diff,
&to_commit,
self.entropy_threshold,
&self.custom_patterns,
&mut cumulative_findings,
);
for cf in cumulative_findings {
let dominated = findings.iter().any(|f: &Finding| {
f.location.file_path == cf.location.file_path
&& f.secret.value == cf.secret.value
});
if !dominated {
findings.push(cf);
}
}
}
}
}
}
}
}
Ok(findings)
}
/// Scans the commits reachable from `to` but not from `from`, then the
/// cumulative `from` -> `to` diff.
///
/// Both revspecs are peeled to commits before any scanning starts, so
/// annotated tags are accepted and an unusable endpoint fails immediately
/// rather than after the whole range has been walked. At most `max_depth`
/// commits are walked; the cumulative diff is not subject to that limit.
/// Cumulative findings are added only when no existing finding has the same
/// file path and secret value.
///
/// # Errors
/// Fails if either revspec cannot be resolved to a commit, or if walking or
/// scanning any commit or the cumulative diff fails.
fn scan_commit_range(&self, repo: &Repository, from: &str, to: &str) -> Result<Vec<Finding>> {
    let from_commit = repo
        .revparse_single(from)
        .with_context(|| format!("Could not resolve commit: {}", from))?
        .peel_to_commit()
        .with_context(|| format!("Could not find commit object for: {}", from))?;
    let to_commit = repo
        .revparse_single(to)
        .with_context(|| format!("Could not resolve commit: {}", to))?
        .peel_to_commit()
        .with_context(|| format!("Could not find commit object for: {}", to))?;

    let mut revwalk = repo.revwalk()?;
    revwalk.push(to_commit.id())?;
    revwalk.hide(from_commit.id())?;

    let max_commits = self.max_depth.unwrap_or(usize::MAX);
    let mut findings = Vec::new();
    for oid in revwalk.take(max_commits) {
        let commit = repo.find_commit(oid?)?;
        findings.extend(self.scan_commit(repo, &commit)?);
    }

    let from_tree = from_commit.tree()?;
    let to_tree = to_commit.tree()?;
    let mut diff_opts = DiffOptions::new();
    let cumulative_diff =
        repo.diff_tree_to_tree(Some(&from_tree), Some(&to_tree), Some(&mut diff_opts))?;

    let mut cumulative_findings = Vec::new();
    Self::scan_diff_static(
        repo,
        &cumulative_diff,
        &to_commit,
        self.entropy_threshold,
        &self.custom_patterns,
        &mut cumulative_findings,
    )?;

    for cf in cumulative_findings {
        let dominated = findings.iter().any(|f: &Finding| {
            f.location.file_path == cf.location.file_path && f.secret.value == cf.secret.value
        });
        if !dominated {
            findings.push(cf);
        }
    }
    Ok(findings)
}
/// Scans the changes one commit introduces relative to its first parent.
///
/// A commit without parents is diffed against an empty tree, so its whole
/// content is treated as newly added. This variant takes its configuration as
/// arguments instead of reading it from `self`.
fn scan_commit_static(
    repo: &Repository,
    commit: &Commit,
    entropy_threshold: f64,
    custom_patterns: &[crate::config::settings::CustomPattern],
) -> Result<Vec<Finding>> {
    let new_tree = commit.tree()?;
    let old_tree = if commit.parent_count() == 0 {
        None
    } else {
        Some(commit.parent(0)?.tree()?)
    };

    let mut opts = DiffOptions::new();
    let diff = repo.diff_tree_to_tree(old_tree.as_ref(), Some(&new_tree), Some(&mut opts))?;

    let mut collected = Vec::new();
    Self::scan_diff_static(
        repo,
        &diff,
        commit,
        entropy_threshold,
        custom_patterns,
        &mut collected,
    )?;
    Ok(collected)
}
/// Scans the changes `commit` introduces relative to its first parent, using
/// this scanner's configuration.
///
/// A commit without parents is diffed against an empty tree.
fn scan_commit(&self, repo: &Repository, commit: &Commit) -> Result<Vec<Finding>> {
    let commit_tree = commit.tree()?;
    let base_tree = match commit.parent_count() {
        0 => None,
        _ => Some(commit.parent(0)?.tree()?),
    };

    let mut options = DiffOptions::new();
    let diff =
        repo.diff_tree_to_tree(base_tree.as_ref(), Some(&commit_tree), Some(&mut options))?;

    self.scan_diff(repo, &diff, commit)
}
/// Runs the pattern detector over every added (`+`) line of `diff` and appends
/// one `Finding` per detected secret to `findings`.
///
/// * `_repo` is unused.
/// * `commit` supplies the hash, author name and timestamp recorded on each
///   finding's location.
/// * `entropy_threshold` is passed to the detector for every line.
/// * `custom_patterns`, when non-empty, is used to build the detector.
///
/// A delta without a new-file path is reported with `scan_warn!` and
/// contributes no findings.
///
/// NOTE(review): vendor paths are not filtered here; the file context is only
/// attached to each finding. Returning `false` from a `Diff::foreach`
/// callback stops the whole walk, so any per-file skipping would have to be
/// done in the line callback.
///
/// # Errors
/// Propagates any error returned by `Diff::foreach`. Findings pushed before
/// the error remain in `findings`.
fn scan_diff_static(
    _repo: &Repository,
    diff: &Diff,
    commit: &Commit,
    entropy_threshold: f64,
    custom_patterns: &[crate::config::settings::CustomPattern],
    findings: &mut Vec<Finding>,
) -> Result<()> {
    let detector = if custom_patterns.is_empty() {
        PatternDetector::new()
    } else {
        PatternDetector::with_custom_patterns(custom_patterns)
    };

    // Commit metadata is identical for every finding in this diff: compute it once.
    let commit_ts = commit_timestamp(commit);
    let commit_hash = commit.id().to_string();
    let commit_author = commit.author().name().map(String::from);

    diff.foreach(
        &mut |delta, _progress| {
            if delta.new_file().path().is_none() {
                scan_warn!(
                    "git",
                    "diff delta has no file path in commit {}",
                    commit.id()
                );
            }
            // Always continue: `false` would abort the entire diff walk.
            true
        },
        None,
        None,
        Some(&mut |delta, _hunk, line| {
            // Only added lines can introduce a secret in this diff.
            if line.origin() != '+' {
                return true;
            }
            let path = match delta.new_file().path() {
                Some(p) => p,
                None => return true,
            };

            let content = String::from_utf8_lossy(line.content());
            let line_number = line.new_lineno().unwrap_or(0) as usize;
            let secrets = detector.scan_line(&content, entropy_threshold);
            for secret in secrets {
                let file_context = ContextAnalyzer::analyze_file(path);
                let location = Location {
                    file_path: path.to_path_buf(),
                    line_number,
                    column_start: 0,
                    column_end: content.len(),
                    commit_hash: Some(commit_hash.clone()),
                    commit_author: commit_author.clone(),
                    commit_date: Some(commit_ts),
                };
                let context = ContextAnalyzer::build_context(
                    content.to_string(),
                    None,
                    None,
                    &file_context,
                );
                findings.push(Finding::new(secret, location, context));
            }
            true
        }),
    )?;
    Ok(())
}
fn scan_diff(&self, _repo: &Repository, diff: &Diff, commit: &Commit) -> Result<Vec<Finding>> {
let mut findings = Vec::new();
let detector = if self.custom_patterns.is_empty() {
PatternDetector::new()
} else {
PatternDetector::with_custom_patterns(&self.custom_patterns)
};
let commit_ts = commit_timestamp(commit);
diff.foreach(
&mut |delta, _progress| {
let file_path = match delta.new_file().path() {
Some(p) => p,
None => {
scan_warn!(
"git",
"diff delta has no file path in commit {}",
commit.id()
);
return true;
}
};
let file_context = ContextAnalyzer::analyze_file(file_path);
if file_context.is_vendor {
return true;
}
true
},
None,
None,
Some(&mut |_delta, _hunk, line| {
let content = String::from_utf8_lossy(line.content());
if line.origin() == '+' {
let secrets = detector.scan_line(&content, self.entropy_threshold);
for secret in secrets {
if let Some(path) = _delta.new_file().path() {
let file_context = ContextAnalyzer::analyze_file(path);
let location = Location {
file_path: path.to_path_buf(),
line_number: line.new_lineno().unwrap_or(0) as usize,
column_start: 0,
column_end: content.len(),
commit_hash: Some(commit.id().to_string()),
commit_author: commit.author().name().map(|s| s.to_string()),
commit_date: Some(commit_ts),
};
let context = ContextAnalyzer::build_context(
content.to_string(),
None,
None,
&file_context,
);
let finding = Finding::new(secret, location, context);
findings.push(finding);
}
}
}
true
}),
)?;
Ok(findings)
}
/// Scans the repository's checked-out files with a `FilesystemScanner`
/// configured from this scanner's settings.
///
/// The entropy threshold and `include_deps` are always forwarded; custom
/// patterns and the filesystem depth limit are forwarded only when set.
///
/// # Errors
/// Fails if the repository has no working directory or if the filesystem scan
/// fails.
fn scan_working_directory(&self, repo: &Repository) -> Result<Vec<Finding>> {
    let root = repo
        .workdir()
        .context("Repository doesn't have a working directory")?
        .to_path_buf();

    let mut fs_scanner = crate::scanners::FilesystemScanner::new(root)
        .with_entropy_threshold(self.entropy_threshold)
        .with_include_deps(self.include_deps);

    if !self.custom_patterns.is_empty() {
        let patterns = self.custom_patterns.clone();
        fs_scanner = fs_scanner.with_custom_patterns(patterns);
    }
    if let Some(limit) = self.max_fs_depth {
        fs_scanner = fs_scanner.with_max_fs_depth(limit);
    }

    fs_scanner.scan()
}
/// Lists every commit reachable from `HEAD`, in revision-walk order.
///
/// Author names and messages that are not valid UTF-8 are replaced with the
/// placeholders `<non-UTF-8 author>` and `<non-UTF-8 message>`.
///
/// # Errors
/// Fails if the repository cannot be opened, the walk cannot be started from
/// `HEAD`, or a commit yielded by the walk cannot be loaded.
pub fn get_commits(&self) -> Result<Vec<CommitInfo>> {
    // Same error context as `scan`, so callers see one message for this failure.
    let repo = Repository::open(&self.repo_path).context("Failed to open git repository")?;
    let mut revwalk = repo.revwalk()?;
    revwalk.push_head()?;

    let mut commits = Vec::new();
    for oid in revwalk {
        let commit = repo.find_commit(oid?)?;
        let author = commit
            .author()
            .name()
            .unwrap_or("<non-UTF-8 author>")
            .to_string();
        let message = commit
            .message()
            .unwrap_or("<non-UTF-8 message>")
            .to_string();
        commits.push(CommitInfo {
            hash: commit.id().to_string(),
            author,
            message,
            timestamp: commit_timestamp(&commit),
        });
    }
    Ok(commits)
}
}
/// Summary of a single commit as returned by `GitScanner::get_commits`.
#[derive(Debug, Clone)]
pub struct CommitInfo {
/// Commit id rendered as a string.
pub hash: String,
/// Author name, or `<non-UTF-8 author>` when the name is not valid UTF-8.
pub author: String,
/// Commit message, or `<non-UTF-8 message>` when it is not valid UTF-8.
pub message: String,
/// Commit time converted to UTC by `commit_timestamp`.
pub timestamp: chrono::DateTime<chrono::Utc>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::Path;
    use tempfile::TempDir;

    /// Builds a temporary repository with one commit containing `test.txt`,
    /// whose content looks like an AWS access key.
    ///
    /// The `TempDir` is returned so the directory outlives the test body.
    fn create_test_repo() -> Result<(TempDir, Repository)> {
        const FILE_NAME: &str = "test.txt";

        let temp_dir = TempDir::new()?;
        let repo = Repository::init(temp_dir.path())?;
        fs::write(
            temp_dir.path().join(FILE_NAME),
            "AWS_KEY=AKIAZ52HGXYRN4WBTEST",
        )?;

        let mut index = repo.index()?;
        index.add_path(Path::new(FILE_NAME))?;
        index.write()?;
        let tree_id = index.write_tree()?;

        let signature = git2::Signature::now("Test", "test@example.com")?;
        {
            // Scoped so `tree`, which borrows `repo`, is dropped before `repo`
            // is moved into the return value.
            let tree = repo.find_tree(tree_id)?;
            repo.commit(
                Some("HEAD"),
                &signature,
                &signature,
                "Initial commit",
                &tree,
                &[],
            )?;
        }
        Ok((temp_dir, repo))
    }

    /// A freshly constructed scanner has history scanning enabled.
    #[test]
    fn test_git_scanner_creation() {
        let scanner = GitScanner::new(PathBuf::from("."));
        assert!(scanner.scan_history);
    }

    /// With history scanning on, the committed key is reported.
    #[test]
    fn test_git_scanner_finds_secrets_in_repo() -> Result<()> {
        let (temp_dir, _repo) = create_test_repo()?;
        let repo_root = temp_dir.path().to_path_buf();

        let findings = GitScanner::new(repo_root)
            .with_history(true)
            .with_entropy_threshold(3.0)
            .scan()?;

        assert!(!findings.is_empty(), "Should find secrets in git repo");
        Ok(())
    }

    /// With history scanning off, the working-directory scan still reports the key.
    #[test]
    fn test_git_scanner_without_history() -> Result<()> {
        let (temp_dir, _repo) = create_test_repo()?;
        let repo_root = temp_dir.path().to_path_buf();

        let findings = GitScanner::new(repo_root)
            .with_history(false)
            .with_entropy_threshold(3.0)
            .scan()?;

        assert!(
            !findings.is_empty(),
            "Should find secrets in working directory even without history scanning"
        );
        Ok(())
    }

    /// `with_max_depth` stores the requested limit.
    #[test]
    fn test_git_scanner_with_max_depth() {
        let scanner = GitScanner::new(PathBuf::from(".")).with_max_depth(10);
        assert_eq!(scanner.max_depth, Some(10));
    }
}