keyhog-sources 0.2.1

Pluggable input sources: filesystem, git history, stdin, s3
Documentation
//! Git repository source: scans repository commits and extracts text blobs with
//! `gix`, stopping once the in-memory byte cap is reached.

use std::collections::{HashSet, VecDeque};
use std::io::BufRead;
use std::path::{Path, PathBuf};
use std::process::Command;

use gix::objs::Kind;
use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};

/// Maximum total in-memory bytes for all git blob content.
/// 256 MiB covers large monorepos without OOM.
const MAX_GIT_TOTAL_BYTES: usize = 256 * 1024 * 1024;

/// Maximum size of a single git blob. Larger objects (binaries, vendor bundles)
/// are skipped entirely — secrets almost never appear in 10+ MiB files.
const MAX_GIT_BLOB_BYTES: u64 = 10 * 1024 * 1024;

/// Maximum number of chunks the git source can produce.
/// Guards against repos with millions of tiny files where the byte limit alone
/// wouldn't cap memory: each chunk carries ~200 bytes of metadata overhead,
/// so 500K chunks × 200B = ~100 MB metadata ceiling.
const MAX_GIT_CHUNKS: usize = 500_000;

/// Scans git history: traverses commits and extracts text blob contents.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::Source;
/// use keyhog_sources::GitSource;
/// use std::path::PathBuf;
///
/// let source = GitSource::new(PathBuf::from(".")).with_max_commits(10);
/// assert_eq!(source.name(), "git");
/// ```
pub struct GitSource {
    repo_path: PathBuf,
    max_commits: Option<usize>,
}

impl GitSource {
    /// Create a source that traverses a git repository.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from("."));
    /// assert_eq!(source.name(), "git");
    /// ```
    pub fn new(repo_path: PathBuf) -> Self {
        Self {
            repo_path,
            max_commits: None,
        }
    }

    /// Limit how many commits are traversed from `HEAD`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from(".")).with_max_commits(5);
    /// assert_eq!(source.name(), "git");
    /// ```
    pub fn with_max_commits(mut self, n: usize) -> Self {
        self.max_commits = Some(n);
        self
    }
}

impl Source for GitSource {
    fn name(&self) -> &str {
        "git"
    }

    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        match stream_git_blobs(&self.repo_path, self.max_commits) {
            Ok(iter) => Box::new(iter),
            Err(e) => Box::new(std::iter::once(Err(e))),
        }
    }
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

fn stream_git_blobs(
    repo_path: &Path,
    max_commits: Option<usize>,
) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
    let repo_arg = super::validate_repo_path(repo_path)?;

    // Get commit hashes from HEAD
    let mut log_cmd = Command::new("git");
    log_cmd.args(["-C", &repo_arg, "log", "--format=%H %an"]);
    if let Some(limit) = max_commits {
        log_cmd.args(["--max-count", &limit.to_string()]);
    }
    log_cmd.arg("--end-of-options");

    log_cmd.stdout(std::process::Stdio::piped());
    let mut log_child = log_cmd.spawn().map_err(SourceError::Io)?;
    let log_stdout = log_child
        .stdout
        .take()
        .ok_or_else(|| SourceError::Io(std::io::Error::other("missing log stdout")))?;
    let mut log_lines = std::io::BufReader::new(log_stdout).lines();

    let repo_owned = repo_path.to_path_buf();
    let mut current_tree_blobs: VecDeque<Chunk> = VecDeque::new();
    let mut seen_blobs: HashSet<gix::ObjectId> = HashSet::new();
    let mut total_bytes = 0usize;
    let mut chunk_count = 0usize;
    let mut done = false;

    Ok(std::iter::from_fn(move || {
        if done {
            return None;
        }

        loop {
            if let Some(chunk) = current_tree_blobs.pop_front() {
                return Some(Ok(chunk));
            }

            if total_bytes >= MAX_GIT_TOTAL_BYTES || chunk_count >= MAX_GIT_CHUNKS {
                done = true;
                return None;
            }

            let line = match log_lines.next() {
                Some(Ok(l)) => l,
                Some(Err(e)) => {
                    done = true;
                    return Some(Err(SourceError::Io(e)));
                }
                None => {
                    done = true;
                    return None;
                }
            };

            let parts: Vec<&str> = line.splitn(2, ' ').collect();
            if parts.len() < 2 {
                continue;
            }
            let commit_id = parts[0];
            let author = parts[1];

            let Ok(repo) = gix::open(&repo_owned) else {
                continue;
            };
            let Ok(id) = gix::ObjectId::from_hex(commit_id.as_bytes()) else {
                continue;
            };
            let Ok(obj) = repo.find_object(id) else {
                continue;
            };
            let Ok(commit) = obj.try_into_commit() else {
                continue;
            };
            let Ok(tree) = commit.tree() else {
                continue;
            };

            let mut chunks = Vec::new();
            collect_tree_blobs_to_vec(
                &repo,
                &tree,
                commit_id,
                author,
                &mut seen_blobs,
                &mut chunks,
                &mut total_bytes,
                &mut chunk_count,
                b"",
            );

            if !chunks.is_empty() {
                current_tree_blobs.extend(chunks);
                if let Some(chunk) = current_tree_blobs.pop_front() {
                    return Some(Ok(chunk));
                }
            }
        }
    }))
}

fn collect_tree_blobs_to_vec(
    repo: &gix::Repository,
    tree: &gix::Tree<'_>,
    commit_id: &str,
    author: &str,
    seen_blobs: &mut HashSet<gix::ObjectId>,
    chunks: &mut Vec<Chunk>,
    total_bytes: &mut usize,
    chunk_count: &mut usize,
    prefix: &[u8],
) {
    if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
        return;
    }
    for entry_ref in tree.iter() {
        if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
            return;
        }
        let entry = match entry_ref {
            Ok(e) => e,
            Err(_) => continue,
        };

        let oid = entry.oid().to_owned();

        let filepath = if prefix.is_empty() {
            entry.filename().to_vec()
        } else {
            let mut p = prefix.to_vec();
            p.push(b'/');
            p.extend_from_slice(entry.filename());
            p
        };

        let mode = entry.mode();

        if mode.is_tree() {
            if let Ok(obj) = repo.find_object(oid)
                && let Ok(subtree) = obj.try_into_tree()
            {
                collect_tree_blobs_to_vec(
                    repo,
                    &subtree,
                    commit_id,
                    author,
                    seen_blobs,
                    chunks,
                    total_bytes,
                    chunk_count,
                    &filepath,
                );
            }
            continue;
        }

        if !mode.is_blob() {
            continue;
        }

        if !seen_blobs.insert(oid) {
            continue;
        }

        let header = match repo.find_header(oid) {
            Ok(header) => header,
            Err(_) => continue,
        };
        if header.kind() != Kind::Blob || header.size() > MAX_GIT_BLOB_BYTES {
            continue;
        }

        let obj = match repo.find_object(oid) {
            Ok(o) => o,
            Err(_) => continue,
        };

        let file_text = match std::str::from_utf8(&obj.data) {
            Ok(text) => text.to_string(),
            Err(_) => continue,
        };

        let path = String::from_utf8_lossy(&filepath).to_string();
        *total_bytes = total_bytes.saturating_add(file_text.len());
        *chunk_count += 1;

        chunks.push(Chunk {
            data: file_text,
            metadata: ChunkMetadata {
                source_type: "git".into(),
                path: Some(path),
                commit: Some(commit_id.to_string()),
                author: Some(author.to_string()),
                date: None,
            },
        });
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn git_source_name() {
        let source = GitSource::new(PathBuf::from("/tmp"));
        assert_eq!(source.name(), "git");
    }

    #[test]
    fn git_source_with_max_commits() {
        let source = GitSource::new(PathBuf::from("/tmp")).with_max_commits(100);
        assert_eq!(source.max_commits, Some(100));
    }

    #[test]
    fn git_source_default_no_commit_limit() {
        let source = GitSource::new(PathBuf::from("/tmp"));
        assert!(source.max_commits.is_none());
    }
}