Skip to main content

keyhog_sources/git/
history.rs

1//! Git history source: scans all commits in a repository's history for secrets
2//! that may have been committed and later removed.
3
4use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
5use std::path::{Path, PathBuf};
6use std::process::Command;
7
/// Scans git history commit-by-commit using patch output and extracts added lines.
///
/// The source is configured with the repository location and an optional cap
/// on the number of commits to traverse; see [`GitHistorySource::new`] and
/// [`GitHistorySource::with_max_commits`].
///
/// # Examples
///
/// ```rust
/// use keyhog_core::Source;
/// use keyhog_sources::GitHistorySource;
/// use std::path::PathBuf;
///
/// let source = GitHistorySource::new(PathBuf::from(".")).with_max_commits(25);
/// assert_eq!(source.name(), "git-history");
/// ```
#[derive(Debug, Clone)]
pub struct GitHistorySource {
    /// Location of the repository whose history is scanned.
    repo_path: PathBuf,
    /// Upper bound on the number of commits to read; `None` means no limit.
    max_commits: Option<usize>,
}
24
25impl GitHistorySource {
26    /// Create a source that scans commit history patches for added lines.
27    ///
28    /// # Examples
29    ///
30    /// ```rust
31    /// use keyhog_core::Source;
32    /// use keyhog_sources::GitHistorySource;
33    /// use std::path::PathBuf;
34    ///
35    /// let source = GitHistorySource::new(PathBuf::from("."));
36    /// assert_eq!(source.name(), "git-history");
37    /// ```
38    pub fn new(repo_path: PathBuf) -> Self {
39        Self {
40            repo_path,
41            max_commits: None,
42        }
43    }
44
45    /// Limit how many commits are traversed from `HEAD`.
46    ///
47    /// # Examples
48    ///
49    /// ```rust
50    /// use keyhog_core::Source;
51    /// use keyhog_sources::GitHistorySource;
52    /// use std::path::PathBuf;
53    ///
54    /// let source = GitHistorySource::new(PathBuf::from(".")).with_max_commits(2);
55    /// assert_eq!(source.name(), "git-history");
56    /// ```
57    pub fn with_max_commits(mut self, n: usize) -> Self {
58        self.max_commits = Some(n);
59        self
60    }
61}
62
63impl Source for GitHistorySource {
64    fn name(&self) -> &str {
65        "git-history"
66    }
67
68    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
69        match stream_git_history_chunks(&self.repo_path, self.max_commits) {
70            Ok(iter) => Box::new(iter),
71            Err(error) => Box::new(std::iter::once(Err(error))),
72        }
73    }
74    fn as_any(&self) -> &dyn std::any::Any {
75        self
76    }
77}
78
79fn stream_git_history_chunks(
80    repo_path: &Path,
81    max_commits: Option<usize>,
82) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
83    let repo_arg = super::validate_repo_path(repo_path)?;
84    let mut command = Command::new(super::git_bin()?);
85    command.args([
86        "-C",
87        &repo_arg,
88        "log",
89        "--date=iso-strict",
90        "--format=commit %H%nAuthor: %an <%ae>%nDate: %aI",
91        "-p",
92        "-m",
93    ]);
94
95    if let Some(limit) = max_commits {
96        command.args(["--max-count", &limit.to_string()]);
97    }
98
99    command.arg("--end-of-options");
100    command.stdout(std::process::Stdio::piped());
101    command.stderr(std::process::Stdio::piped());
102
103    let mut child = command.spawn().map_err(SourceError::Io)?;
104    let stdout = child
105        .stdout
106        .take()
107        .ok_or_else(|| SourceError::Io(std::io::Error::other("missing stdout")))?;
108    let mut reader = std::io::BufReader::new(stdout);
109
110    let mut current_commit: Option<String> = None;
111    let mut current_author: Option<String> = None;
112    let mut current_date: Option<String> = None;
113    let mut current_path: Option<String> = None;
114    let mut current_content = String::new();
115    let mut in_hunk = false;
116    let mut done = false;
117    let mut line_buf = Vec::new();
118
119    Ok(std::iter::from_fn(move || {
120        if done {
121            return None;
122        }
123
124        loop {
125            line_buf.clear();
126            let line = match std::io::BufRead::read_until(&mut reader, b'\n', &mut line_buf) {
127                Ok(0) => {
128                    done = true;
129                    if let (Some(commit), Some(author), Some(date), Some(path)) = (
130                        &current_commit,
131                        &current_author,
132                        &current_date,
133                        &current_path,
134                    ) {
135                        if !current_content.trim().is_empty() {
136                            return Some(Ok(Chunk {
137                                data: current_content.trim().to_string().into(),
138                                metadata: ChunkMetadata {
139                                    base_offset: 0,
140                                    source_type: "git-history".into(),
141                                    path: Some(path.clone()),
142                                    commit: Some(commit.clone()),
143                                    author: Some(author.clone()),
144                                    date: Some(date.clone()),
145                                                                    mtime_ns: None,
146                                    size_bytes: None,
147},
148                            }));
149                        }
150                    }
151                    return None;
152                }
153                Ok(_) => {
154                    let l = String::from_utf8_lossy(&line_buf);
155                    l.trim_end_matches('\n').trim_end_matches('\r').to_string()
156                }
157                Err(e) => {
158                    done = true;
159                    return Some(Err(SourceError::Io(e)));
160                }
161            };
162
163            if let Some(commit) = line.strip_prefix("commit ") {
164                let prev_chunk = if let (Some(commit), Some(author), Some(date), Some(path)) = (
165                    &current_commit,
166                    &current_author,
167                    &current_date,
168                    &current_path,
169                ) {
170                    if !current_content.trim().is_empty() {
171                        Some(Chunk {
172                            data: current_content.trim().to_string().into(),
173                            metadata: ChunkMetadata {
174                                base_offset: 0,
175                                source_type: "git-history".into(),
176                                path: Some(path.clone()),
177                                commit: Some(commit.clone()),
178                                author: Some(author.clone()),
179                                date: Some(date.clone()),
180                                                            mtime_ns: None,
181                                size_bytes: None,
182},
183                        })
184                    } else {
185                        None
186                    }
187                } else {
188                    None
189                };
190
191                current_commit = Some(commit.trim().to_string());
192                current_author = None;
193                current_date = None;
194                current_path = None;
195                current_content.clear();
196                in_hunk = false;
197
198                if let Some(chunk) = prev_chunk {
199                    return Some(Ok(chunk));
200                }
201                continue;
202            }
203
204            if let Some(author) = line.strip_prefix("Author: ") {
205                current_author = Some(author.trim().to_string());
206                continue;
207            }
208
209            if let Some(date) = line.strip_prefix("Date: ") {
210                current_date = Some(date.trim().to_string());
211                continue;
212            }
213
214            if line.starts_with("diff --git ") {
215                let prev_chunk = if let (Some(commit), Some(author), Some(date), Some(path)) = (
216                    &current_commit,
217                    &current_author,
218                    &current_date,
219                    &current_path,
220                ) {
221                    if !current_content.trim().is_empty() {
222                        Some(Chunk {
223                            data: current_content.trim().to_string().into(),
224                            metadata: ChunkMetadata {
225                                base_offset: 0,
226                                source_type: "git-history".into(),
227                                path: Some(path.clone()),
228                                commit: Some(commit.clone()),
229                                author: Some(author.clone()),
230                                date: Some(date.clone()),
231                                                            mtime_ns: None,
232                                size_bytes: None,
233},
234                        })
235                    } else {
236                        None
237                    }
238                } else {
239                    None
240                };
241
242                current_path = extract_new_path(&line);
243                current_content.clear();
244                in_hunk = false;
245
246                if let Some(chunk) = prev_chunk {
247                    return Some(Ok(chunk));
248                }
249                continue;
250            }
251
252            if line.starts_with("new file mode")
253                || line.starts_with("index ")
254                || line.starts_with("--- ")
255            {
256                continue;
257            }
258
259            if let Some(path_part) = line.strip_prefix("+++ b/") {
260                current_path = sanitize_path(path_part);
261                continue;
262            }
263
264            if line.starts_with("@@") && line.contains("@@") {
265                in_hunk = true;
266                continue;
267            }
268
269            if (in_hunk || line.starts_with('+'))
270                && line.starts_with('+')
271                && !line.starts_with("+++")
272            {
273                current_content.push_str(&line[1..]);
274                current_content.push('\n');
275            }
276
277            // Safety cap to prevent unlimited memory growth per file hunk
278            if current_content.len() > 10 * 1024 * 1024 {
279                if let (Some(commit), Some(author), Some(date), Some(path)) = (
280                    &current_commit,
281                    &current_author,
282                    &current_date,
283                    &current_path,
284                ) {
285                    let chunk_content = current_content.trim().to_string();
286                    current_content.clear();
287                    return Some(Ok(Chunk {
288                        data: chunk_content.into(),
289                        metadata: ChunkMetadata {
290                            base_offset: 0,
291                            source_type: "git-history".into(),
292                            path: Some(path.clone()),
293                            commit: Some(commit.clone()),
294                            author: Some(author.clone()),
295                            date: Some(date.clone()),
296                                                    mtime_ns: None,
297                            size_bytes: None,
298},
299                    }));
300                }
301            }
302        }
303    }))
304}
305
306fn extract_new_path(line: &str) -> Option<String> {
307    line.find(" b/")
308        .and_then(|index| sanitize_path(&line[index + 3..]))
309}
310
/// Normalizes a path taken from diff output into a safe, relative,
/// `/`-separated string.
///
/// Backslashes are treated as separators, `.` components are dropped and `..`
/// components are resolved against the preceding component. Returns `None`
/// for empty input, `/dev/null`, absolute paths, paths containing control
/// characters, paths that climb above their starting directory, and paths
/// that normalize to nothing.
fn sanitize_path(path: &str) -> Option<String> {
    use std::path::Component;

    let cleaned = path.trim().replace('\\', "/");
    if cleaned.is_empty() || cleaned == "/dev/null" {
        return None;
    }
    if cleaned.chars().any(char::is_control) || Path::new(&cleaned).is_absolute() {
        return None;
    }

    let mut parts: Vec<String> = Vec::new();
    for component in Path::new(&cleaned).components() {
        match component {
            Component::CurDir => continue,
            Component::Normal(name) => parts.push(name.to_string_lossy().into_owned()),
            Component::ParentDir => {
                // A ".." with nothing left to pop would escape the root.
                if parts.pop().is_none() {
                    return None;
                }
            }
            Component::RootDir | Component::Prefix(_) => return None,
        }
    }

    (!parts.is_empty()).then(|| parts.join("/"))
}
339        None
340    } else {
341        Some(normalized.join("/"))
342    }
343}