project_rag/git/
walker.rs1use anyhow::{Context, Result};
2use git2::{DiffOptions, Repository, Sort};
3use std::collections::HashSet;
4use std::path::{Path, PathBuf};
5
/// Metadata and patch text extracted from a single git commit.
///
/// Values are plain owned data, so a `CommitInfo` can outlive the repository
/// handle it was read from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommitInfo {
    /// Hex object id of the commit.
    pub hash: String,
    /// Commit message; empty when the message is unavailable as UTF-8.
    pub message: String,
    /// Author name; `"Unknown"` when the name is unavailable as UTF-8.
    pub author_name: String,
    /// Author e-mail address; empty when unavailable as UTF-8.
    pub author_email: String,
    /// Commit time in seconds relative to the Unix epoch.
    pub commit_date: i64,
    /// Paths touched by the commit, taken from the diff against its first
    /// parent (or against the empty tree for a root commit).
    pub files_changed: Vec<String>,
    /// Patch text of that diff; large patches are cut short and end with a
    /// truncation marker.
    pub diff_content: String,
    /// Hex object ids of the commit's parents, in parent order.
    pub parent_hashes: Vec<String>,
}
26
27pub struct GitWalker {
29 repo: Repository,
30 repo_path: PathBuf,
31}
32
33impl GitWalker {
34 pub fn discover<P: AsRef<Path>>(path: P) -> Result<Self> {
36 let path = path.as_ref();
37
38 let repo_path = Repository::discover(path)
40 .context("Failed to discover git repository")?
41 .path()
42 .parent()
43 .context("Invalid repository path")?
44 .to_path_buf();
45
46 let repo = Repository::open(&repo_path).context("Failed to open git repository")?;
47
48 tracing::info!("Opened git repository at: {}", repo_path.display());
49
50 Ok(Self { repo, repo_path })
51 }
52
53 pub fn repo_path(&self) -> &Path {
55 &self.repo_path
56 }
57
58 pub fn current_branch(&self) -> Option<String> {
60 self.repo.head().ok()?.shorthand().map(|s| s.to_string())
61 }
62
63 pub fn iter_commits(
65 &self,
66 branch: Option<&str>,
67 max_count: Option<usize>,
68 since_date: Option<i64>,
69 until_date: Option<i64>,
70 skip_hashes: &HashSet<String>,
71 ) -> Result<Vec<CommitInfo>> {
72 let mut revwalk = self.repo.revwalk()?;
73 revwalk.set_sorting(Sort::TIME | Sort::TOPOLOGICAL)?;
74
75 if let Some(branch_name) = branch {
77 let reference = self
78 .repo
79 .find_branch(branch_name, git2::BranchType::Local)
80 .context("Failed to find branch")?;
81 let oid = reference.get().target().context("Branch has no target")?;
82 revwalk.push(oid)?;
83 } else {
84 revwalk.push_head()?;
86 }
87
88 let mut commits = Vec::new();
89 let mut count = 0;
90 let max = max_count.unwrap_or(usize::MAX);
91
92 for oid in revwalk {
93 if count >= max {
94 break;
95 }
96
97 let oid = oid?;
98 let commit = self.repo.find_commit(oid)?;
99 let commit_hash = format!("{}", commit.id());
100
101 if skip_hashes.contains(&commit_hash) {
103 tracing::debug!("Skipping already indexed commit: {}", commit_hash);
104 continue;
105 }
106
107 let commit_time = commit.time().seconds();
108
109 if let Some(since) = since_date
111 && commit_time < since
112 {
113 break; }
115
116 if let Some(until) = until_date
117 && commit_time > until
118 {
119 continue;
120 }
121
122 let commit_info = self.extract_commit_info(&commit)?;
124 commits.push(commit_info);
125 count += 1;
126
127 if count % 50 == 0 {
128 tracing::debug!("Processed {} commits", count);
129 }
130 }
131
132 tracing::info!("Extracted {} new commits", commits.len());
133 Ok(commits)
134 }
135
136 fn extract_commit_info(&self, commit: &git2::Commit) -> Result<CommitInfo> {
138 let hash = format!("{}", commit.id());
139 let message = commit.message().unwrap_or("").to_string();
140 let author = commit.author();
141 let author_name = author.name().unwrap_or("Unknown").to_string();
142 let author_email = author.email().unwrap_or("").to_string();
143 let commit_date = commit.time().seconds();
144
145 let parent_hashes: Vec<String> = commit.parents().map(|p| format!("{}", p.id())).collect();
147
148 let (files_changed, diff_content) = self.extract_diff(commit)?;
150
151 Ok(CommitInfo {
152 hash,
153 message,
154 author_name,
155 author_email,
156 commit_date,
157 files_changed,
158 diff_content,
159 parent_hashes,
160 })
161 }
162
163 fn extract_diff(&self, commit: &git2::Commit) -> Result<(Vec<String>, String)> {
165 let mut files_changed = Vec::new();
166 let mut diff_content = String::new();
167 let mut diff_truncated = false;
168
169 let tree = commit.tree()?;
170
171 let parent_tree = if commit.parent_count() > 0 {
173 Some(commit.parent(0)?.tree()?)
174 } else {
175 None
176 };
177
178 let mut diff_opts = DiffOptions::new();
179 diff_opts
180 .context_lines(3)
181 .interhunk_lines(0)
182 .ignore_whitespace(false);
183
184 let diff = if let Some(parent) = parent_tree {
185 self.repo
186 .diff_tree_to_tree(Some(&parent), Some(&tree), Some(&mut diff_opts))?
187 } else {
188 self.repo
190 .diff_tree_to_tree(None, Some(&tree), Some(&mut diff_opts))?
191 };
192
193 for delta in diff.deltas() {
195 if let Some(path) = delta.new_file().path() {
196 files_changed.push(path.display().to_string());
197 }
198 }
199
200 diff.print(git2::DiffFormat::Patch, |_delta, _hunk, line| {
202 if diff_truncated {
204 return true;
205 }
206
207 if line.origin() == 'B' {
209 return true;
210 }
211
212 if diff_content.len() >= 100_000 {
214 diff_truncated = true;
215 return true; }
217
218 let origin = line.origin();
220 if let Ok(content) = std::str::from_utf8(line.content()) {
221 match origin {
222 '+' | '-' | ' ' => {
223 diff_content.push(origin);
224 diff_content.push_str(content);
225 }
226 'F' => {
227 diff_content.push_str("--- ");
229 diff_content.push_str(content);
230 }
231 'H' => {
232 diff_content.push_str(content);
234 }
235 _ => {}
236 }
237 } else {
238 tracing::debug!("Skipping diff line with invalid UTF-8");
240 }
241
242 true
244 })?;
245
246 if diff_content.len() > 8000 {
248 diff_content.truncate(8000);
249 diff_content.push_str("\n\n[... diff truncated ...]");
250 tracing::warn!("Truncated large diff for commit {}", commit.id());
251 }
252
253 Ok((files_changed, diff_content))
254 }
255
256 pub fn has_commits(&self) -> bool {
258 self.repo.head().is_ok()
259 }
260
261 pub fn estimate_commit_count(&self) -> Result<usize> {
263 let mut revwalk = self.repo.revwalk()?;
264 revwalk.push_head()?;
265 Ok(revwalk.count())
266 }
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Opens the repository that contains the current working directory.
    /// These tests therefore need to run from inside a git checkout.
    fn open_walker() -> GitWalker {
        GitWalker::discover(".").expect("Should find git repo")
    }

    /// Discovery yields an existing path and a resolvable `HEAD`.
    #[test]
    fn test_discover_current_repo() {
        let walker = open_walker();

        assert!(walker.repo_path().exists());
        assert!(walker.has_commits());
    }

    /// `HEAD` has a shorthand name.
    #[test]
    fn test_current_branch() {
        let walker = open_walker();

        assert!(
            walker.current_branch().is_some(),
            "Should have a current branch"
        );
    }

    /// `max_count` caps the result and basic fields are populated.
    #[test]
    fn test_iter_commits_limited() {
        let walker = open_walker();
        let nothing_skipped = HashSet::new();

        let commits = walker
            .iter_commits(None, Some(5), None, None, &nothing_skipped)
            .expect("Should iterate commits");

        assert!(commits.len() <= 5, "Should respect max_count");

        for info in commits.iter() {
            assert!(!info.hash.is_empty(), "Commit hash should not be empty");
            assert!(
                !info.author_name.is_empty(),
                "Author name should not be empty"
            );
        }
    }

    /// The newest commit has a 40-character id and a positive timestamp.
    #[test]
    fn test_commit_info_structure() {
        let walker = open_walker();
        let nothing_skipped = HashSet::new();

        let commits = walker
            .iter_commits(None, Some(1), None, None, &nothing_skipped)
            .expect("Should get commits");

        // An empty history makes this test a no-op.
        let Some(newest) = commits.first() else {
            return;
        };

        assert_eq!(newest.hash.len(), 40, "Git SHA should be 40 chars");
        assert!(newest.commit_date > 0, "Commit date should be positive");
    }

    /// A hash listed in `skip_hashes` is not returned again.
    #[test]
    fn test_skip_hashes() {
        let walker = open_walker();
        let nothing_skipped = HashSet::new();

        let baseline = walker
            .iter_commits(None, Some(1), None, None, &nothing_skipped)
            .expect("Should get commits");

        let Some(newest) = baseline.first() else {
            return;
        };

        let already_seen: HashSet<String> = std::iter::once(newest.hash.clone()).collect();

        let remaining = walker
            .iter_commits(None, Some(1), None, None, &already_seen)
            .expect("Should get commits");

        // With a single-commit history nothing is left to compare against.
        if let Some(next) = remaining.first() {
            assert_ne!(
                newest.hash, next.hash,
                "Should skip specified commit"
            );
        }
    }
}