use std::collections::{HashSet, VecDeque};
use std::io::BufRead;
use std::path::{Path, PathBuf};
use std::process::Command;
use gix::objs::Kind;
use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
/// Budget (256 MiB) for the cumulative byte length of blob text emitted by one scan.
/// Collection stops once the running total reaches this value.
const MAX_GIT_TOTAL_BYTES: usize = 256 * 1024 * 1024;
/// Largest blob (10 MiB, as reported by the object header) that is read; bigger blobs are skipped.
const MAX_GIT_BLOB_BYTES: u64 = 10 * 1024 * 1024;
/// Maximum number of chunks collected by one scan.
const MAX_GIT_CHUNKS: usize = 500_000;
/// A [`Source`] that scans the text blobs of a git repository's commit history.
///
/// Build one with [`GitSource::new`] and optionally cap the number of commits
/// walked with [`GitSource::with_max_commits`].
#[derive(Debug, Clone)]
pub struct GitSource {
    /// Filesystem path of the repository to scan.
    repo_path: PathBuf,
    /// Maximum number of commits to walk; `None` means no limit.
    max_commits: Option<usize>,
}
impl GitSource {
    /// Creates a source for the repository at `repo_path` with no commit limit.
    pub fn new(repo_path: PathBuf) -> Self {
        let max_commits = None;
        Self {
            repo_path,
            max_commits,
        }
    }

    /// Returns this source reconfigured to walk at most `n` commits.
    pub fn with_max_commits(self, n: usize) -> Self {
        Self {
            max_commits: Some(n),
            ..self
        }
    }
}
impl Source for GitSource {
    /// Identifier reported for this source; always `"git"`.
    fn name(&self) -> &str {
        "git"
    }

    /// Streams the chunks produced by `stream_git_blobs` for this repository.
    ///
    /// If the stream cannot be set up, the returned iterator yields that single
    /// error and then ends.
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        let blobs = match stream_git_blobs(&self.repo_path, self.max_commits) {
            Ok(blobs) => blobs,
            Err(setup_err) => return Box::new(std::iter::once(Err(setup_err))),
        };
        Box::new(blobs)
    }

    /// Exposes the concrete type as `Any` so callers can downcast.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}
fn stream_git_blobs(
repo_path: &Path,
max_commits: Option<usize>,
) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
let repo_arg = super::validate_repo_path(repo_path)?;
let mut log_cmd = Command::new("git");
log_cmd.args(["-C", &repo_arg, "log", "--format=%H %an"]);
if let Some(limit) = max_commits {
log_cmd.args(["--max-count", &limit.to_string()]);
}
log_cmd.arg("--end-of-options");
log_cmd.stdout(std::process::Stdio::piped());
let mut log_child = log_cmd.spawn().map_err(SourceError::Io)?;
let log_stdout = log_child
.stdout
.take()
.ok_or_else(|| SourceError::Io(std::io::Error::other("missing log stdout")))?;
let mut log_lines = std::io::BufReader::new(log_stdout).lines();
let repo_owned = repo_path.to_path_buf();
let mut current_tree_blobs: VecDeque<Chunk> = VecDeque::new();
let mut seen_blobs: HashSet<gix::ObjectId> = HashSet::new();
let mut total_bytes = 0usize;
let mut chunk_count = 0usize;
let mut done = false;
Ok(std::iter::from_fn(move || {
if done {
return None;
}
loop {
if let Some(chunk) = current_tree_blobs.pop_front() {
return Some(Ok(chunk));
}
if total_bytes >= MAX_GIT_TOTAL_BYTES || chunk_count >= MAX_GIT_CHUNKS {
done = true;
return None;
}
let line = match log_lines.next() {
Some(Ok(l)) => l,
Some(Err(e)) => {
done = true;
return Some(Err(SourceError::Io(e)));
}
None => {
done = true;
return None;
}
};
let parts: Vec<&str> = line.splitn(2, ' ').collect();
if parts.len() < 2 {
continue;
}
let commit_id = parts[0];
let author = parts[1];
let Ok(repo) = gix::open(&repo_owned) else {
continue;
};
let Ok(id) = gix::ObjectId::from_hex(commit_id.as_bytes()) else {
continue;
};
let Ok(obj) = repo.find_object(id) else {
continue;
};
let Ok(commit) = obj.try_into_commit() else {
continue;
};
let Ok(tree) = commit.tree() else {
continue;
};
let mut chunks = Vec::new();
collect_tree_blobs_to_vec(
&repo,
&tree,
commit_id,
author,
&mut seen_blobs,
&mut chunks,
&mut total_bytes,
&mut chunk_count,
b"",
);
if !chunks.is_empty() {
current_tree_blobs.extend(chunks);
if let Some(chunk) = current_tree_blobs.pop_front() {
return Some(Ok(chunk));
}
}
}
}))
}
/// Recursively walks `tree` and appends one [`Chunk`] to `chunks` for every
/// blob that has not been seen before, is at most `MAX_GIT_BLOB_BYTES` large
/// and is valid UTF-8.
///
/// * `commit_id` / `author` are copied into each chunk's metadata.
/// * `seen_blobs` records the ids already handled so a blob is emitted only
///   once across calls. It also records the ids of subtrees that were walked
///   to completion: such a subtree can only contain already-seen blobs, so it
///   is skipped when met again instead of being re-read for every commit.
/// * `total_bytes` / `chunk_count` are running totals; the walk stops as soon
///   as either reaches `MAX_GIT_TOTAL_BYTES` / `MAX_GIT_CHUNKS`.
/// * `prefix` is the slash-separated path of `tree` (empty for the root).
///
/// Entries that cannot be decoded or loaded are skipped.
fn collect_tree_blobs_to_vec(
    repo: &gix::Repository,
    tree: &gix::Tree<'_>,
    commit_id: &str,
    author: &str,
    seen_blobs: &mut HashSet<gix::ObjectId>,
    chunks: &mut Vec<Chunk>,
    total_bytes: &mut usize,
    chunk_count: &mut usize,
    prefix: &[u8],
) {
    /// Traversal state shared by every level of the recursive walk.
    struct Walker<'a> {
        repo: &'a gix::Repository,
        commit_id: &'a str,
        author: &'a str,
        seen: &'a mut HashSet<gix::ObjectId>,
        chunks: &'a mut Vec<Chunk>,
        total_bytes: &'a mut usize,
        chunk_count: &'a mut usize,
    }

    /// Joins `prefix` and `name` with `/`, or returns `name` alone at the root.
    fn join_path(prefix: &[u8], name: &[u8]) -> Vec<u8> {
        if prefix.is_empty() {
            return name.to_vec();
        }
        let mut joined = Vec::with_capacity(prefix.len() + 1 + name.len());
        joined.extend_from_slice(prefix);
        joined.push(b'/');
        joined.extend_from_slice(name);
        joined
    }

    impl Walker<'_> {
        /// True once the byte or chunk budget has been used up.
        fn limit_reached(&self) -> bool {
            *self.total_bytes >= MAX_GIT_TOTAL_BYTES || *self.chunk_count >= MAX_GIT_CHUNKS
        }

        /// Walks `tree`; returns `true` only if it and everything beneath it
        /// was visited without hitting a limit or a tree that failed to load.
        fn walk(&mut self, tree: &gix::Tree<'_>, prefix: &[u8]) -> bool {
            // Copy the reference so objects borrowed from the repository do
            // not keep `self` borrowed across the recursive call.
            let repo = self.repo;
            let mut complete = true;

            for entry_ref in tree.iter() {
                if self.limit_reached() {
                    return false;
                }
                let Ok(entry) = entry_ref else {
                    complete = false;
                    continue;
                };
                let oid = entry.oid().to_owned();
                let mode = entry.mode();

                if mode.is_tree() {
                    // A subtree that was already walked to completion holds
                    // only blobs that are in `seen`, so it yields nothing new.
                    if self.seen.contains(&oid) {
                        continue;
                    }
                    if let Ok(obj) = repo.find_object(oid)
                        && let Ok(subtree) = obj.try_into_tree()
                    {
                        let sub_prefix = join_path(prefix, entry.filename());
                        if self.walk(&subtree, &sub_prefix) {
                            self.seen.insert(oid);
                        } else {
                            complete = false;
                        }
                    } else {
                        complete = false;
                    }
                    continue;
                }

                if !mode.is_blob() {
                    continue;
                }
                if !self.seen.insert(oid) {
                    continue;
                }

                // Check kind and size from the header before reading the blob.
                let Ok(header) = repo.find_header(oid) else {
                    continue;
                };
                if header.kind() != Kind::Blob || header.size() > MAX_GIT_BLOB_BYTES {
                    continue;
                }
                let Ok(obj) = repo.find_object(oid) else {
                    continue;
                };
                // Blobs that are not valid UTF-8 are treated as binary and skipped.
                let Ok(text) = std::str::from_utf8(&obj.data) else {
                    continue;
                };
                let file_text = text.to_owned();
                let path =
                    String::from_utf8_lossy(&join_path(prefix, entry.filename())).into_owned();

                *self.total_bytes = self.total_bytes.saturating_add(file_text.len());
                *self.chunk_count += 1;
                self.chunks.push(Chunk {
                    data: file_text,
                    metadata: ChunkMetadata {
                        source_type: "git".into(),
                        path: Some(path),
                        commit: Some(self.commit_id.to_string()),
                        author: Some(self.author.to_string()),
                        date: None,
                    },
                });
            }
            complete
        }
    }

    let mut walker = Walker {
        repo,
        commit_id,
        author,
        seen: seen_blobs,
        chunks,
        total_bytes,
        chunk_count,
    };
    walker.walk(tree, prefix);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The source always identifies itself as `"git"`.
    #[test]
    fn git_source_name() {
        let git = GitSource::new(PathBuf::from("/tmp"));
        assert_eq!("git", git.name());
    }

    /// The builder stores the requested commit limit.
    #[test]
    fn git_source_with_max_commits() {
        let limited = GitSource::new(PathBuf::from("/tmp")).with_max_commits(100);
        assert!(matches!(limited.max_commits, Some(100)));
    }

    /// A freshly constructed source has no commit limit.
    #[test]
    fn git_source_default_no_commit_limit() {
        let unlimited = GitSource::new(PathBuf::from("/tmp"));
        assert_eq!(unlimited.max_commits, None);
    }
}