keyhog-sources 0.1.0

Pluggable input sources: filesystem, git history, stdin, s3
Documentation
//! Git repository source: walks commits reachable from `HEAD` and extracts UTF-8
//! text blobs with `gix`, stopping once the in-memory byte cap or the chunk-count
//! cap is reached.

use std::collections::HashSet;
use std::path::PathBuf;

use gix::objs::Kind;
use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};

/// Budget for the combined size of all blob text held in memory (256 MiB).
///
/// The budget is tested before each blob is loaded, so the final total may
/// overshoot by at most one blob (bounded by `MAX_GIT_BLOB_BYTES`).
const MAX_GIT_TOTAL_BYTES: usize = 256 << 20;

/// Per-blob size ceiling (10 MiB). Anything larger (binaries, vendor bundles)
/// is skipped outright — secrets almost never appear in files that big.
const MAX_GIT_BLOB_BYTES: u64 = 10 << 20;

/// Upper bound on how many chunks the git source may emit.
///
/// Protects against repositories made of millions of tiny files, where the byte
/// budget alone would not bound memory: at roughly 200 bytes of metadata per
/// chunk, 500K chunks keep the metadata overhead near 100 MB.
const MAX_GIT_CHUNKS: usize = 500_000;

/// Scans git history: traverses commits and extracts text blob contents.
///
/// The source is configured with the repository location and an optional cap on
/// how many commits are visited; the repository itself is only opened when the
/// chunks are requested.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::Source;
/// use keyhog_sources::GitSource;
/// use std::path::PathBuf;
///
/// let source = GitSource::new(PathBuf::from(".")).with_max_commits(10);
/// assert_eq!(source.name(), "git");
/// ```
#[derive(Debug, Clone)]
pub struct GitSource {
    /// Filesystem path of the repository to open.
    repo_path: PathBuf,
    /// Optional cap on the number of commits visited; `None` means no cap.
    max_commits: Option<usize>,
}

impl GitSource {
    /// Create a source that traverses a git repository.
    ///
    /// No commit limit is set; use [`GitSource::with_max_commits`] to add one.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from("."));
    /// assert_eq!(source.name(), "git");
    /// ```
    #[must_use]
    pub fn new(repo_path: PathBuf) -> Self {
        Self {
            repo_path,
            max_commits: None,
        }
    }

    /// Limit how many commits are traversed from `HEAD`.
    ///
    /// This consumes and returns the source, so the result must be used:
    /// dropping it silently discards the limit.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from(".")).with_max_commits(5);
    /// assert_eq!(source.name(), "git");
    /// ```
    #[must_use]
    pub fn with_max_commits(mut self, n: usize) -> Self {
        self.max_commits = Some(n);
        self
    }
}

impl Source for GitSource {
    /// Identifier of this source; always `"git"`.
    fn name(&self) -> &str {
        "git"
    }

    /// Collects every chunk from the repository up front and returns an
    /// iterator over them.
    ///
    /// If collection fails, the iterator yields exactly one `Err` carrying
    /// that failure and then ends.
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        let collected = match collect_git_chunks(&self.repo_path, self.max_commits) {
            Ok(collected) => collected,
            Err(err) => return Box::new(std::iter::once(Err(err))),
        };
        Box::new(collected.into_iter().map(Ok))
    }
}

/// Walks history from `HEAD` and returns one [`Chunk`] per distinct UTF-8 blob.
///
/// Each blob is attributed to the first commit (in walk order) and path where it
/// is encountered; later occurrences of the same object id are ignored.
///
/// `max_commits` caps the number of entries taken from the ancestor walk
/// (entries that fail to decode still count toward the cap). The walk also stops
/// once `MAX_GIT_TOTAL_BYTES` or `MAX_GIT_CHUNKS` is reached.
///
/// # Errors
///
/// Returns `SourceError::Git` when the repository cannot be opened, `HEAD`
/// cannot be resolved to a commit, or the ancestor walk cannot be started.
/// Problems with individual commits are logged at debug level and skipped.
fn collect_git_chunks(
    repo_path: &std::path::Path,
    max_commits: Option<usize>,
) -> Result<Vec<Chunk>, SourceError> {
    let repo = gix::open(repo_path).map_err(|e| SourceError::Git(e.to_string()))?;

    let head = repo
        .head_commit()
        .map_err(|e| SourceError::Git(format!("failed to get HEAD: {}", e)))?;

    let ancestors = head
        .ancestors()
        .all()
        .map_err(|e| SourceError::Git(format!("failed to traverse: {}", e)))?;

    let mut traversal = BlobTraversal::default();
    for (count, info) in ancestors.enumerate() {
        if let Some(max) = max_commits
            && count >= max
        {
            break;
        }

        let info = match info {
            Ok(i) => i,
            Err(e) => {
                tracing::debug!("failed to traverse git commit: {}", e);
                continue;
            }
        };

        let commit_id = info.id().to_string();

        // A commit that cannot be loaded is skipped (best effort), but leave a
        // trace so gaps in the scan can be diagnosed.
        let obj = match info.id().object() {
            Ok(o) => o,
            Err(e) => {
                tracing::debug!("skipping git commit {}: {}", commit_id, e);
                continue;
            }
        };
        let commit: gix::Commit<'_> = match obj.try_into_commit() {
            Ok(c) => c,
            Err(e) => {
                tracing::debug!("skipping git object {}: {}", commit_id, e);
                continue;
            }
        };

        let author = commit
            .author()
            .map(|a| a.name.to_string())
            .unwrap_or_default();

        let tree = match commit.tree() {
            Ok(t) => t,
            Err(e) => {
                tracing::debug!("skipping git commit {}: no tree: {}", commit_id, e);
                continue;
            }
        };

        collect_tree_blobs(&repo, &tree, &commit_id, &author, &mut traversal, b"");
        if traversal.total_bytes >= MAX_GIT_TOTAL_BYTES {
            tracing::warn!(
                "git history scan: reached {} byte in-memory limit",
                MAX_GIT_TOTAL_BYTES
            );
            break;
        }
        if traversal.chunks.len() >= MAX_GIT_CHUNKS {
            tracing::warn!("git history scan: reached {} chunk limit", MAX_GIT_CHUNKS);
            break;
        }
    }

    Ok(traversal.chunks)
}

/// Mutable state accumulated over the whole history walk.
#[derive(Default)]
struct BlobTraversal {
    /// Blob ids already examined (emitted or rejected); each is handled once.
    seen_blobs: HashSet<gix::ObjectId>,
    /// Subtree ids already walked. Re-walking one can never yield a new blob
    /// because every blob below it is already in `seen_blobs`.
    seen_trees: HashSet<gix::ObjectId>,
    /// Chunks produced so far.
    chunks: Vec<Chunk>,
    /// Sum of the text lengths of all chunks produced so far.
    total_bytes: usize,
}

impl BlobTraversal {
    /// True once either the byte budget or the chunk budget is exhausted.
    fn limit_reached(&self) -> bool {
        self.total_bytes >= MAX_GIT_TOTAL_BYTES || self.chunks.len() >= MAX_GIT_CHUNKS
    }
}

/// Joins a tree-relative `prefix` and an entry `name` with `/`; an empty prefix
/// yields just the name.
fn join_git_path(prefix: &[u8], name: &[u8]) -> Vec<u8> {
    if prefix.is_empty() {
        return name.to_vec();
    }
    let mut joined = Vec::with_capacity(prefix.len() + 1 + name.len());
    joined.extend_from_slice(prefix);
    joined.push(b'/');
    joined.extend_from_slice(name);
    joined
}

/// Recursively walks `tree`, pushing a chunk for every not-yet-seen blob that is
/// valid UTF-8 and no larger than `MAX_GIT_BLOB_BYTES`.
///
/// `prefix` is the path of `tree` relative to the repository root (empty for the
/// root tree). `commit_id` and `author` are copied into each chunk's metadata.
///
/// Unreadable entries and objects are skipped silently (best effort). The limits
/// are tested before each entry, so the byte total may overshoot the budget by
/// at most one blob.
fn collect_tree_blobs(
    repo: &gix::Repository,
    tree: &gix::Tree<'_>,
    commit_id: &str,
    author: &str,
    traversal: &mut BlobTraversal,
    prefix: &[u8],
) {
    for entry_ref in tree.iter() {
        if traversal.limit_reached() {
            return;
        }
        let Ok(entry) = entry_ref else { continue };

        let oid = entry.oid().to_owned();
        let mode = entry.mode();

        if mode.is_tree() {
            // Unchanged directories share an object id across commits; walking
            // them again would only revisit blobs already in `seen_blobs`.
            if !traversal.seen_trees.insert(oid) {
                continue;
            }
            if let Ok(obj) = repo.find_object(oid)
                && let Ok(subtree) = obj.try_into_tree()
            {
                let subtree_path = join_git_path(prefix, entry.filename());
                collect_tree_blobs(repo, &subtree, commit_id, author, traversal, &subtree_path);
            }
            continue;
        }

        // Anything that is not a blob entry is ignored, and each blob is
        // examined once no matter how many commits or paths reference it.
        if !mode.is_blob() || !traversal.seen_blobs.insert(oid) {
            continue;
        }

        // Check the header first so oversized objects are never loaded.
        let Ok(header) = repo.find_header(oid) else {
            continue;
        };
        if header.kind() != Kind::Blob || header.size() > MAX_GIT_BLOB_BYTES {
            continue;
        }

        let Ok(obj) = repo.find_object(oid) else {
            continue;
        };

        // Blobs that are not valid UTF-8 are skipped.
        let Ok(text) = std::str::from_utf8(&obj.data) else {
            continue;
        };
        let file_text = text.to_owned();

        // The full path is only built for blobs that are actually emitted.
        let path = String::from_utf8_lossy(&join_git_path(prefix, entry.filename())).into_owned();
        traversal.total_bytes = traversal.total_bytes.saturating_add(file_text.len());

        traversal.chunks.push(Chunk {
            data: file_text,
            metadata: ChunkMetadata {
                source_type: "git".into(),
                path: Some(path),
                commit: Some(commit_id.to_string()),
                author: Some(author.to_string()),
                date: None,
            },
        });
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a source pointing at a fixed path; the repository is never opened.
    fn sample_source() -> GitSource {
        GitSource::new(PathBuf::from("/tmp"))
    }

    /// The source reports the fixed name `"git"`.
    #[test]
    fn git_source_name() {
        assert_eq!(sample_source().name(), "git");
    }

    /// `with_max_commits` stores the requested limit.
    #[test]
    fn git_source_with_max_commits() {
        let limited = sample_source().with_max_commits(100);
        assert_eq!(limited.max_commits, Some(100));
    }

    /// A freshly constructed source has no commit limit.
    #[test]
    fn git_source_default_no_commit_limit() {
        assert_eq!(sample_source().max_commits, None);
    }
}