keyhog_sources/git/
source.rs1use std::collections::{HashSet, VecDeque};
5use std::io::BufRead;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8
9use gix::objs::Kind;
10use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
11
/// Upper bound on the cumulative text size (in bytes) of all chunks produced
/// by a single git scan; scanning stops once this is reached (256 MiB).
const MAX_GIT_TOTAL_BYTES: usize = 256 << 20;

/// Blobs whose object header reports a size above this are skipped (10 MiB).
const MAX_GIT_BLOB_BYTES: u64 = 10 << 20;

/// Upper bound on the number of chunks produced by a single git scan.
const MAX_GIT_CHUNKS: usize = 500_000;
25
/// A [`Source`] that yields the text blobs found in a git repository's history.
pub struct GitSource {
    /// Location of the repository to scan.
    repo_path: PathBuf,
    /// Optional cap on how many commits `git log` is asked to list.
    max_commits: Option<usize>,
}

impl GitSource {
    /// Creates a source for the repository at `repo_path` with no commit limit.
    pub fn new(repo_path: PathBuf) -> Self {
        GitSource {
            max_commits: None,
            repo_path,
        }
    }

    /// Returns this source with the commit limit set to `n`.
    pub fn with_max_commits(self, n: usize) -> Self {
        let mut limited = self;
        limited.max_commits = Some(n);
        limited
    }
}
80
81impl Source for GitSource {
82 fn name(&self) -> &str {
83 "git"
84 }
85
86 fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
87 match stream_git_blobs(&self.repo_path, self.max_commits) {
88 Ok(iter) => Box::new(iter),
89 Err(e) => Box::new(std::iter::once(Err(e))),
90 }
91 }
92 fn as_any(&self) -> &dyn std::any::Any {
93 self
94 }
95}
96
97fn stream_git_blobs(
98 repo_path: &Path,
99 max_commits: Option<usize>,
100) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
101 let repo_arg = super::validate_repo_path(repo_path)?;
102
103 let mut log_cmd = Command::new(super::git_bin()?);
108 log_cmd.args([
109 "-C",
110 &repo_arg,
111 "log",
112 "--all",
113 "--branches",
114 "--tags",
115 "-m", "--format=%H %an",
117 ]);
118 if let Some(limit) = max_commits {
119 log_cmd.args(["--max-count", &limit.to_string()]);
120 }
121 log_cmd.arg("--end-of-options");
122
123 log_cmd.stdout(std::process::Stdio::piped());
124 let mut log_child = log_cmd.spawn().map_err(SourceError::Io)?;
125 let log_stdout = log_child
126 .stdout
127 .take()
128 .ok_or_else(|| SourceError::Io(std::io::Error::other("missing log stdout")))?;
129 let mut log_lines = std::io::BufReader::new(log_stdout).lines();
130
131 let repo_owned = repo_path.to_path_buf();
135 let repo_handle = gix::open(&repo_owned)
136 .map_err(|e| SourceError::Io(std::io::Error::other(format!("gix open: {e}"))))?;
137 let head_blobs = collect_head_blob_set(&repo_handle).unwrap_or_default();
144 let mut current_tree_blobs: VecDeque<Chunk> = VecDeque::new();
145 let mut seen_blobs: HashSet<gix::ObjectId> = HashSet::new();
146 let mut total_bytes = 0usize;
147 let mut chunk_count = 0usize;
148 let mut done = false;
149
150 Ok(std::iter::from_fn(move || {
151 if done {
152 return None;
153 }
154
155 loop {
156 if let Some(chunk) = current_tree_blobs.pop_front() {
157 return Some(Ok(chunk));
158 }
159
160 if total_bytes >= MAX_GIT_TOTAL_BYTES || chunk_count >= MAX_GIT_CHUNKS {
161 done = true;
162 return None;
163 }
164
165 let line = match log_lines.next() {
166 Some(Ok(l)) => l,
167 Some(Err(e)) => {
168 done = true;
169 return Some(Err(SourceError::Io(e)));
170 }
171 None => {
172 done = true;
173 return None;
174 }
175 };
176
177 let parts: Vec<&str> = line.splitn(2, ' ').collect();
178 if parts.len() < 2 {
179 continue;
180 }
181 let commit_id = parts[0];
182 let author = parts[1];
183
184 let repo = &repo_handle;
185 let Ok(id) = gix::ObjectId::from_hex(commit_id.as_bytes()) else {
186 continue;
187 };
188 let Ok(obj) = repo.find_object(id) else {
189 continue;
190 };
191 let Ok(commit) = obj.try_into_commit() else {
192 continue;
193 };
194 let Ok(tree) = commit.tree() else {
195 continue;
196 };
197
198 let mut chunks = Vec::new();
199 collect_tree_blobs_to_vec(
200 repo,
201 &tree,
202 commit_id,
203 author,
204 &head_blobs,
205 &mut seen_blobs,
206 &mut chunks,
207 &mut total_bytes,
208 &mut chunk_count,
209 b"",
210 );
211
212 if !chunks.is_empty() {
213 current_tree_blobs.extend(chunks);
214 if let Some(chunk) = current_tree_blobs.pop_front() {
215 return Some(Ok(chunk));
216 }
217 }
218 }
219 }))
220}
221
222fn collect_tree_blobs_to_vec(
223 repo: &gix::Repository,
224 tree: &gix::Tree<'_>,
225 commit_id: &str,
226 author: &str,
227 head_blobs: &HashSet<gix::ObjectId>,
228 seen_blobs: &mut HashSet<gix::ObjectId>,
229 chunks: &mut Vec<Chunk>,
230 total_bytes: &mut usize,
231 chunk_count: &mut usize,
232 prefix: &[u8],
233) {
234 if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
235 return;
236 }
237 for entry_ref in tree.iter() {
238 if *total_bytes >= MAX_GIT_TOTAL_BYTES || *chunk_count >= MAX_GIT_CHUNKS {
239 return;
240 }
241 let entry = match entry_ref {
242 Ok(e) => e,
243 Err(_) => continue,
244 };
245
246 let oid = entry.oid().to_owned();
247
248 let filepath = if prefix.is_empty() {
249 entry.filename().to_vec()
250 } else {
251 let mut p = prefix.to_vec();
252 p.push(b'/');
253 p.extend_from_slice(entry.filename());
254 p
255 };
256
257 let mode = entry.mode();
258
259 if mode.is_tree() {
260 if let Ok(obj) = repo.find_object(oid) {
261 if let Ok(subtree) = obj.try_into_tree() {
262 collect_tree_blobs_to_vec(
263 repo,
264 &subtree,
265 commit_id,
266 author,
267 head_blobs,
268 seen_blobs,
269 chunks,
270 total_bytes,
271 chunk_count,
272 &filepath,
273 );
274 }
275 }
276 continue;
277 }
278
279 if !mode.is_blob() {
280 continue;
281 }
282
283 if !seen_blobs.insert(oid) {
284 continue;
285 }
286
287 let header = match repo.find_header(oid) {
288 Ok(header) => header,
289 Err(_) => continue,
290 };
291 if header.kind() != Kind::Blob || header.size() > MAX_GIT_BLOB_BYTES {
292 continue;
293 }
294
295 let obj = match repo.find_object(oid) {
296 Ok(o) => o,
297 Err(_) => continue,
298 };
299
300 let file_text = match std::str::from_utf8(&obj.data) {
301 Ok(text) => text.to_string(),
302 Err(_) => continue,
303 };
304
305 let path = String::from_utf8_lossy(&filepath).to_string();
306 *total_bytes = total_bytes.saturating_add(file_text.len());
307 *chunk_count += 1;
308
309 let in_head = head_blobs.contains(&oid);
310 chunks.push(Chunk {
311 data: file_text.into(),
312 metadata: ChunkMetadata {
313 base_offset: 0,
314 source_type: if in_head { "git/head" } else { "git/history" }.into(),
315 path: Some(path),
316 commit: Some(commit_id.to_string()),
317 author: Some(author.to_string()),
318 date: None,
319 mtime_ns: None,
320 size_bytes: None,
321 },
322 });
323 }
324}
325
326fn collect_head_blob_set(repo: &gix::Repository) -> Option<HashSet<gix::ObjectId>> {
334 let head = repo.head().ok()?;
335 let head_id = head.try_into_peeled_id().ok().flatten()?;
336 let commit = repo.find_object(head_id).ok()?.try_into_commit().ok()?;
337 let tree = commit.tree().ok()?;
338 let mut out = HashSet::new();
339 walk_tree_for_blobs(repo, &tree, &mut out);
340 Some(out)
341}
342
343fn walk_tree_for_blobs(
344 repo: &gix::Repository,
345 tree: &gix::Tree<'_>,
346 out: &mut HashSet<gix::ObjectId>,
347) {
348 for entry_ref in tree.iter() {
349 let Ok(entry) = entry_ref else { continue };
350 let oid = entry.oid().to_owned();
351 let mode = entry.mode();
352 if mode.is_tree() {
353 if let Ok(obj) = repo.find_object(oid) {
354 if let Ok(subtree) = obj.try_into_tree() {
355 walk_tree_for_blobs(repo, &subtree, out);
356 }
357 }
358 } else if mode.is_blob() {
359 out.insert(oid);
360 }
361 }
362}