1use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
5use std::path::{Path, PathBuf};
6use std::process::Command;
7
/// A scan source that reads the commit history of a local git repository.
///
/// The repository is identified by `repo_path`; `max_commits`, when set, caps
/// how many commits are requested from `git log`.
#[derive(Debug, Clone)]
pub struct GitHistorySource {
    /// Path of the repository whose history is scanned.
    repo_path: PathBuf,
    /// Optional upper bound on the number of commits to read (`None` = no limit).
    max_commits: Option<usize>,
}

impl GitHistorySource {
    /// Creates a source for the repository at `repo_path` with no commit limit.
    pub fn new(repo_path: PathBuf) -> Self {
        Self {
            repo_path,
            max_commits: None,
        }
    }

    /// Returns the source with the commit limit set to `n`.
    ///
    /// This consumes `self` and returns the updated value, so the result must
    /// be used for the limit to take effect.
    #[must_use]
    pub fn with_max_commits(mut self, n: usize) -> Self {
        self.max_commits = Some(n);
        self
    }
}
62
63impl Source for GitHistorySource {
64 fn name(&self) -> &str {
65 "git-history"
66 }
67
68 fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
69 match stream_git_history_chunks(&self.repo_path, self.max_commits) {
70 Ok(iter) => Box::new(iter),
71 Err(error) => Box::new(std::iter::once(Err(error))),
72 }
73 }
74 fn as_any(&self) -> &dyn std::any::Any {
75 self
76 }
77}
78
79fn stream_git_history_chunks(
80 repo_path: &Path,
81 max_commits: Option<usize>,
82) -> Result<impl Iterator<Item = Result<Chunk, SourceError>>, SourceError> {
83 let repo_arg = super::validate_repo_path(repo_path)?;
84 let mut command = Command::new(super::git_bin()?);
85 command.args([
86 "-C",
87 &repo_arg,
88 "log",
89 "--date=iso-strict",
90 "--format=commit %H%nAuthor: %an <%ae>%nDate: %aI",
91 "-p",
92 "-m",
93 ]);
94
95 if let Some(limit) = max_commits {
96 command.args(["--max-count", &limit.to_string()]);
97 }
98
99 command.arg("--end-of-options");
100 command.stdout(std::process::Stdio::piped());
101 command.stderr(std::process::Stdio::piped());
102
103 let mut child = command.spawn().map_err(SourceError::Io)?;
104 let stdout = child
105 .stdout
106 .take()
107 .ok_or_else(|| SourceError::Io(std::io::Error::other("missing stdout")))?;
108 let mut reader = std::io::BufReader::new(stdout);
109
110 let mut current_commit: Option<String> = None;
111 let mut current_author: Option<String> = None;
112 let mut current_date: Option<String> = None;
113 let mut current_path: Option<String> = None;
114 let mut current_content = String::new();
115 let mut in_hunk = false;
116 let mut done = false;
117 let mut line_buf = Vec::new();
118
119 Ok(std::iter::from_fn(move || {
120 if done {
121 return None;
122 }
123
124 loop {
125 line_buf.clear();
126 let line = match std::io::BufRead::read_until(&mut reader, b'\n', &mut line_buf) {
127 Ok(0) => {
128 done = true;
129 if let (Some(commit), Some(author), Some(date), Some(path)) = (
130 ¤t_commit,
131 ¤t_author,
132 ¤t_date,
133 ¤t_path,
134 ) {
135 if !current_content.trim().is_empty() {
136 return Some(Ok(Chunk {
137 data: current_content.trim().to_string().into(),
138 metadata: ChunkMetadata {
139 base_offset: 0,
140 source_type: "git-history".into(),
141 path: Some(path.clone()),
142 commit: Some(commit.clone()),
143 author: Some(author.clone()),
144 date: Some(date.clone()),
145 mtime_ns: None,
146 size_bytes: None,
147},
148 }));
149 }
150 }
151 return None;
152 }
153 Ok(_) => {
154 let l = String::from_utf8_lossy(&line_buf);
155 l.trim_end_matches('\n').trim_end_matches('\r').to_string()
156 }
157 Err(e) => {
158 done = true;
159 return Some(Err(SourceError::Io(e)));
160 }
161 };
162
163 if let Some(commit) = line.strip_prefix("commit ") {
164 let prev_chunk = if let (Some(commit), Some(author), Some(date), Some(path)) = (
165 ¤t_commit,
166 ¤t_author,
167 ¤t_date,
168 ¤t_path,
169 ) {
170 if !current_content.trim().is_empty() {
171 Some(Chunk {
172 data: current_content.trim().to_string().into(),
173 metadata: ChunkMetadata {
174 base_offset: 0,
175 source_type: "git-history".into(),
176 path: Some(path.clone()),
177 commit: Some(commit.clone()),
178 author: Some(author.clone()),
179 date: Some(date.clone()),
180 mtime_ns: None,
181 size_bytes: None,
182},
183 })
184 } else {
185 None
186 }
187 } else {
188 None
189 };
190
191 current_commit = Some(commit.trim().to_string());
192 current_author = None;
193 current_date = None;
194 current_path = None;
195 current_content.clear();
196 in_hunk = false;
197
198 if let Some(chunk) = prev_chunk {
199 return Some(Ok(chunk));
200 }
201 continue;
202 }
203
204 if let Some(author) = line.strip_prefix("Author: ") {
205 current_author = Some(author.trim().to_string());
206 continue;
207 }
208
209 if let Some(date) = line.strip_prefix("Date: ") {
210 current_date = Some(date.trim().to_string());
211 continue;
212 }
213
214 if line.starts_with("diff --git ") {
215 let prev_chunk = if let (Some(commit), Some(author), Some(date), Some(path)) = (
216 ¤t_commit,
217 ¤t_author,
218 ¤t_date,
219 ¤t_path,
220 ) {
221 if !current_content.trim().is_empty() {
222 Some(Chunk {
223 data: current_content.trim().to_string().into(),
224 metadata: ChunkMetadata {
225 base_offset: 0,
226 source_type: "git-history".into(),
227 path: Some(path.clone()),
228 commit: Some(commit.clone()),
229 author: Some(author.clone()),
230 date: Some(date.clone()),
231 mtime_ns: None,
232 size_bytes: None,
233},
234 })
235 } else {
236 None
237 }
238 } else {
239 None
240 };
241
242 current_path = extract_new_path(&line);
243 current_content.clear();
244 in_hunk = false;
245
246 if let Some(chunk) = prev_chunk {
247 return Some(Ok(chunk));
248 }
249 continue;
250 }
251
252 if line.starts_with("new file mode")
253 || line.starts_with("index ")
254 || line.starts_with("--- ")
255 {
256 continue;
257 }
258
259 if let Some(path_part) = line.strip_prefix("+++ b/") {
260 current_path = sanitize_path(path_part);
261 continue;
262 }
263
264 if line.starts_with("@@") && line.contains("@@") {
265 in_hunk = true;
266 continue;
267 }
268
269 if (in_hunk || line.starts_with('+'))
270 && line.starts_with('+')
271 && !line.starts_with("+++")
272 {
273 current_content.push_str(&line[1..]);
274 current_content.push('\n');
275 }
276
277 if current_content.len() > 10 * 1024 * 1024 {
279 if let (Some(commit), Some(author), Some(date), Some(path)) = (
280 ¤t_commit,
281 ¤t_author,
282 ¤t_date,
283 ¤t_path,
284 ) {
285 let chunk_content = current_content.trim().to_string();
286 current_content.clear();
287 return Some(Ok(Chunk {
288 data: chunk_content.into(),
289 metadata: ChunkMetadata {
290 base_offset: 0,
291 source_type: "git-history".into(),
292 path: Some(path.clone()),
293 commit: Some(commit.clone()),
294 author: Some(author.clone()),
295 date: Some(date.clone()),
296 mtime_ns: None,
297 size_bytes: None,
298},
299 }));
300 }
301 }
302 }
303 }))
304}
305
306fn extract_new_path(line: &str) -> Option<String> {
307 line.find(" b/")
308 .and_then(|index| sanitize_path(&line[index + 3..]))
309}
310
/// Normalizes a path taken from diff output into a relative, `/`-separated path.
///
/// Surrounding whitespace is trimmed and backslashes become `/`. `.` segments
/// are dropped and `..` removes the preceding segment.
///
/// Returns `None` for an empty path, `/dev/null`, an absolute path, a path
/// containing control characters, a path whose `..` would climb above its
/// starting point, or a path that normalizes to nothing.
fn sanitize_path(path: &str) -> Option<String> {
    let unified = path.trim().replace('\\', "/");
    let rejected = unified.is_empty()
        || unified == "/dev/null"
        || Path::new(&unified).is_absolute()
        || unified.chars().any(char::is_control);
    if rejected {
        return None;
    }

    let segments = Path::new(&unified).components().try_fold(
        Vec::<String>::new(),
        |mut kept, component| {
            match component {
                std::path::Component::Normal(name) => {
                    kept.push(name.to_string_lossy().into_owned());
                }
                std::path::Component::CurDir => {}
                std::path::Component::ParentDir => {
                    // Nothing left to pop means the path escapes its root.
                    kept.pop()?;
                }
                std::path::Component::Prefix(_) | std::path::Component::RootDir => {
                    return None;
                }
            }
            Some(kept)
        },
    )?;

    (!segments.is_empty()).then(|| segments.join("/"))
}