Skip to main content

keyhog_sources/git/
source.rs

//! Git repository source: enumerates commits with `git log`, reads each commit's
//! tree through `gix`, and emits text blobs until the in-memory byte cap or the
//! chunk-count cap is reached.
3
4use std::collections::{HashSet, VecDeque};
5use std::io::BufRead;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8
9use gix::objs::Kind;
10use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
11
/// Budget, in bytes, for all blob text held in memory by one scan (256 MiB).
///
/// The budget is checked before each blob is read, so the running total can
/// overshoot by at most one blob (itself bounded by `MAX_GIT_BLOB_BYTES`).
const MAX_GIT_TOTAL_BYTES: usize = 256 << 20;

/// Largest single blob that is read (10 MiB). Anything bigger — binaries,
/// vendor bundles — is skipped outright, since secrets almost never live in
/// files of that size.
const MAX_GIT_BLOB_BYTES: u64 = 10 << 20;

/// Upper bound on the number of chunks one scan may emit.
///
/// The byte budget alone does not bound memory for repositories made of
/// millions of tiny files: every chunk also carries roughly 200 bytes of
/// metadata, so 500K chunks keeps that overhead near 100 MB.
const MAX_GIT_CHUNKS: usize = 500 * 1_000;
25
/// A [`Source`] that walks a git repository's commits and yields the text
/// blobs found in their trees.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::Source;
/// use keyhog_sources::GitSource;
/// use std::path::PathBuf;
///
/// let source = GitSource::new(PathBuf::from(".")).with_max_commits(10);
/// assert_eq!(source.name(), "git");
/// ```
pub struct GitSource {
    // Location of the repository to scan.
    repo_path: PathBuf,
    // Optional cap on the number of commits listed; `None` means no cap.
    max_commits: Option<usize>,
}

impl GitSource {
    /// Build a source for the repository at `repo_path`, with no commit limit.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from("."));
    /// assert_eq!(source.name(), "git");
    /// ```
    pub fn new(repo_path: PathBuf) -> Self {
        let max_commits = None;
        GitSource {
            repo_path,
            max_commits,
        }
    }

    /// Cap the number of commits the scan visits at `n`.
    ///
    /// The limit is forwarded to `git log --max-count`; because that listing
    /// is run with `--all`, the cap applies across every ref rather than to
    /// `HEAD`'s ancestry alone.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use keyhog_core::Source;
    /// use keyhog_sources::GitSource;
    /// use std::path::PathBuf;
    ///
    /// let source = GitSource::new(PathBuf::from(".")).with_max_commits(5);
    /// assert_eq!(source.name(), "git");
    /// ```
    pub fn with_max_commits(self, n: usize) -> Self {
        Self {
            max_commits: Some(n),
            ..self
        }
    }
}
80
81impl Source for GitSource {
82    fn name(&self) -> &str {
83        "git"
84    }
85
86    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
87        match stream_git_blobs(&self.repo_path, self.max_commits) {
88            Ok(iter) => Box::new(iter),
89            Err(e) => Box::new(std::iter::once(Err(e))),
90        }
91    }
92    fn as_any(&self) -> &dyn std::any::Any {
93        self
94    }
95}
96
97fn stream_git_blobs(
98    repo_path: &Path,
99    max_commits: Option<usize>,
100) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
101    let repo_arg = super::validate_repo_path(repo_path)?;
102
103    // Get commit hashes from ALL refs — branches, tags, dangling commits.
104    // The previous version walked HEAD ancestry only, silently missing
105    // secrets in feature branches, deleted-but-tagged history, and merge-only
106    // commits. See audit release-2026-04-26 sources/git/source.rs:104.
107    let mut log_cmd = Command::new(super::git_bin()?);
108    log_cmd.args([
109        "-C",
110        &repo_arg,
111        "log",
112        "--all",
113        "--branches",
114        "--tags",
115        "-m", // emit patches for merge commits ("evil merges")
116        "--format=%H %an",
117    ]);
118    if let Some(limit) = max_commits {
119        log_cmd.args(["--max-count", &limit.to_string()]);
120    }
121    log_cmd.arg("--end-of-options");
122
123    log_cmd.stdout(std::process::Stdio::piped());
124    let mut log_child = log_cmd.spawn().map_err(SourceError::Io)?;
125    let log_stdout = log_child
126        .stdout
127        .take()
128        .ok_or_else(|| SourceError::Io(std::io::Error::other("missing log stdout")))?;
129    let mut log_lines = std::io::BufReader::new(log_stdout).lines();
130
131    // Open the gix repo ONCE and reuse it for every commit. The previous
132    // version called `gix::open(&repo_owned)` per-commit which on a 10k-commit
133    // repo opened the repo 10k times — fd churn + IO amplification.
134    let repo_owned = repo_path.to_path_buf();
135    let repo_handle = gix::open(&repo_owned)
136        .map_err(|e| SourceError::Io(std::io::Error::other(format!("gix open: {e}"))))?;
137    // Snapshot every blob OID reachable from HEAD's tree. Used to label
138    // emitted chunks as "git/head" (live in HEAD) vs "git/history"
139    // (only present in older commits). The downstream scorer downgrades
140    // the severity of `git/history` findings — a credential a developer
141    // already removed from HEAD is still a leak, but less urgent than
142    // one currently grep-able from main. Cheap: one tree walk at most.
143    let head_blobs = collect_head_blob_set(&repo_handle).unwrap_or_default();
144    let mut current_tree_blobs: VecDeque<Chunk> = VecDeque::new();
145    let mut seen_blobs: HashSet<gix::ObjectId> = HashSet::new();
146    let mut total_bytes = 0usize;
147    let mut chunk_count = 0usize;
148    let mut done = false;
149
150    Ok(std::iter::from_fn(move || {
151        if done {
152            return None;
153        }
154
155        loop {
156            if let Some(chunk) = current_tree_blobs.pop_front() {
157                return Some(Ok(chunk));
158            }
159
160            if total_bytes >= MAX_GIT_TOTAL_BYTES || chunk_count >= MAX_GIT_CHUNKS {
161                done = true;
162                return None;
163            }
164
165            let line = match log_lines.next() {
166                Some(Ok(l)) => l,
167                Some(Err(e)) => {
168                    done = true;
169                    return Some(Err(SourceError::Io(e)));
170                }
171                None => {
172                    done = true;
173                    return None;
174                }
175            };
176
177            let parts: Vec<&str> = line.splitn(2, ' ').collect();
178            if parts.len() < 2 {
179                continue;
180            }
181            let commit_id = parts[0];
182            let author = parts[1];
183
184            let repo = &repo_handle;
185            let Ok(id) = gix::ObjectId::from_hex(commit_id.as_bytes()) else {
186                continue;
187            };
188            let Ok(obj) = repo.find_object(id) else {
189                continue;
190            };
191            let Ok(commit) = obj.try_into_commit() else {
192                continue;
193            };
194            let Ok(tree) = commit.tree() else {
195                continue;
196            };
197
198            let mut chunks = Vec::new();
199            collect_tree_blobs_to_vec(
200                repo,
201                &tree,
202                commit_id,
203                author,
204                &head_blobs,
205                &mut seen_blobs,
206                &mut chunks,
207                &mut total_bytes,
208                &mut chunk_count,
209                b"",
210            );
211
212            if !chunks.is_empty() {
213                current_tree_blobs.extend(chunks);
214                if let Some(chunk) = current_tree_blobs.pop_front() {
215                    return Some(Ok(chunk));
216                }
217            }
218        }
219    }))
220}
221
222fn collect_tree_blobs_to_vec(
223    repo: &gix::Repository,
224    tree: &gix::Tree<'_>,
225    commit_id: &str,
226    author: &str,
227    head_blobs: &HashSet<gix::ObjectId>,
228    seen_blobs: &mut HashSet<gix::ObjectId>,
229    chunks: &mut Vec<Chunk>,
230    total_bytes: &mut usize,
231    chunk_count: &mut usize,
232    prefix: &[u8],
233) {
234    if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
235        return;
236    }
237    for entry_ref in tree.iter() {
238        if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
239            return;
240        }
241        let entry = match entry_ref {
242            Ok(e) => e,
243            Err(_) => continue,
244        };
245
246        let oid = entry.oid().to_owned();
247
248        let filepath = if prefix.is_empty() {
249            entry.filename().to_vec()
250        } else {
251            let mut p = prefix.to_vec();
252            p.push(b'/');
253            p.extend_from_slice(entry.filename());
254            p
255        };
256
257        let mode = entry.mode();
258
259        if mode.is_tree() {
260            if let Ok(obj) = repo.find_object(oid) {
261                if let Ok(subtree) = obj.try_into_tree() {
262                    collect_tree_blobs_to_vec(
263                        repo,
264                        &subtree,
265                        commit_id,
266                        author,
267                        head_blobs,
268                        seen_blobs,
269                        chunks,
270                        total_bytes,
271                        chunk_count,
272                        &filepath,
273                    );
274                }
275            }
276            continue;
277        }
278
279        if !mode.is_blob() {
280            continue;
281        }
282
283        if !seen_blobs.insert(oid) {
284            continue;
285        }
286
287        let header = match repo.find_header(oid) {
288            Ok(header) => header,
289            Err(_) => continue,
290        };
291        if header.kind() != Kind::Blob || header.size() > MAX_GIT_BLOB_BYTES {
292            continue;
293        }
294
295        let obj = match repo.find_object(oid) {
296            Ok(o) => o,
297            Err(_) => continue,
298        };
299
300        let file_text = match std::str::from_utf8(&obj.data) {
301            Ok(text) => text.to_string(),
302            Err(_) => continue,
303        };
304
305        let path = String::from_utf8_lossy(&filepath).to_string();
306        *total_bytes = total_bytes.saturating_add(file_text.len());
307        *chunk_count += 1;
308
309        let in_head = head_blobs.contains(&oid);
310        chunks.push(Chunk {
311            data: file_text.into(),
312            metadata: ChunkMetadata {
313                base_offset: 0,
314                source_type: if in_head { "git/head" } else { "git/history" }.into(),
315                path: Some(path),
316                commit: Some(commit_id.to_string()),
317                author: Some(author.to_string()),
318                date: None,
319                mtime_ns: None,
320                size_bytes: None,
321            },
322        });
323    }
324}
325
326/// Walk HEAD's tree and collect every blob OID reachable from it.
327///
328/// Returns an empty set if HEAD doesn't resolve (detached, empty repo, or
329/// transient I/O error). The caller's behavior in that case: every blob is
330/// labeled `git/history` since we cannot prove it sits in HEAD — safer than
331/// the inverse, which would suppress severity downgrades for genuine
332/// historical leaks.
333fn collect_head_blob_set(repo: &gix::Repository) -> Option<HashSet<gix::ObjectId>> {
334    let head = repo.head().ok()?;
335    let head_id = head.try_into_peeled_id().ok().flatten()?;
336    let commit = repo.find_object(head_id).ok()?.try_into_commit().ok()?;
337    let tree = commit.tree().ok()?;
338    let mut out = HashSet::new();
339    walk_tree_for_blobs(repo, &tree, &mut out);
340    Some(out)
341}
342
343fn walk_tree_for_blobs(
344    repo: &gix::Repository,
345    tree: &gix::Tree<'_>,
346    out: &mut HashSet<gix::ObjectId>,
347) {
348    for entry_ref in tree.iter() {
349        let Ok(entry) = entry_ref else { continue };
350        let oid = entry.oid().to_owned();
351        let mode = entry.mode();
352        if mode.is_tree() {
353            if let Ok(obj) = repo.find_object(oid) {
354                if let Ok(subtree) = obj.try_into_tree() {
355                    walk_tree_for_blobs(repo, &subtree, out);
356                }
357            }
358        } else if mode.is_blob() {
359            out.insert(oid);
360        }
361    }
362}