// project_rag/git/walker.rs

1use anyhow::{Context, Result};
2use git2::{DiffOptions, Repository, Sort};
3use std::collections::HashSet;
4use std::path::{Path, PathBuf};
5
/// Metadata and change summary extracted from a single git commit.
///
/// Derives `PartialEq`/`Eq` so extracted commits can be compared directly,
/// e.g. in tests or when de-duplicating results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommitInfo {
    /// Full commit SHA hash (40 characters)
    pub hash: String,
    /// Commit message (first line and body)
    pub message: String,
    /// Author's name
    pub author_name: String,
    /// Author's email address
    pub author_email: String,
    /// Commit timestamp (Unix epoch seconds)
    pub commit_date: i64,
    /// Paths of the files changed in this commit
    pub files_changed: Vec<String>,
    /// Unified diff content (truncated if too large)
    pub diff_content: String,
    /// SHA hashes of the parent commits (empty for a root commit)
    pub parent_hashes: Vec<String>,
}
26
27/// Git repository walker for extracting commit information
28pub struct GitWalker {
29    repo: Repository,
30    repo_path: PathBuf,
31}
32
33impl GitWalker {
34    /// Discover and open a git repository from any path within it
35    pub fn discover<P: AsRef<Path>>(path: P) -> Result<Self> {
36        let path = path.as_ref();
37
38        // Discover the repository (walks up directory tree)
39        let repo_path = Repository::discover(path)
40            .context("Failed to discover git repository")?
41            .path()
42            .parent()
43            .context("Invalid repository path")?
44            .to_path_buf();
45
46        let repo = Repository::open(&repo_path).context("Failed to open git repository")?;
47
48        tracing::info!("Opened git repository at: {}", repo_path.display());
49
50        Ok(Self { repo, repo_path })
51    }
52
53    /// Get the repository root path
54    pub fn repo_path(&self) -> &Path {
55        &self.repo_path
56    }
57
58    /// Get the current branch name, or None if detached HEAD
59    pub fn current_branch(&self) -> Option<String> {
60        self.repo.head().ok()?.shorthand().map(|s| s.to_string())
61    }
62
63    /// Iterate commits with filters
64    pub fn iter_commits(
65        &self,
66        branch: Option<&str>,
67        max_count: Option<usize>,
68        since_date: Option<i64>,
69        until_date: Option<i64>,
70        skip_hashes: &HashSet<String>,
71    ) -> Result<Vec<CommitInfo>> {
72        let mut revwalk = self.repo.revwalk()?;
73        revwalk.set_sorting(Sort::TIME | Sort::TOPOLOGICAL)?;
74
75        // Determine starting point
76        if let Some(branch_name) = branch {
77            let reference = self
78                .repo
79                .find_branch(branch_name, git2::BranchType::Local)
80                .context("Failed to find branch")?;
81            let oid = reference.get().target().context("Branch has no target")?;
82            revwalk.push(oid)?;
83        } else {
84            // Use HEAD
85            revwalk.push_head()?;
86        }
87
88        let mut commits = Vec::new();
89        let mut count = 0;
90        let max = max_count.unwrap_or(usize::MAX);
91
92        for oid in revwalk {
93            if count >= max {
94                break;
95            }
96
97            let oid = oid?;
98            let commit = self.repo.find_commit(oid)?;
99            let commit_hash = format!("{}", commit.id());
100
101            // Skip if already indexed
102            if skip_hashes.contains(&commit_hash) {
103                tracing::debug!("Skipping already indexed commit: {}", commit_hash);
104                continue;
105            }
106
107            let commit_time = commit.time().seconds();
108
109            // Apply date filters
110            if let Some(since) = since_date
111                && commit_time < since
112            {
113                break; // Commits are sorted, no need to continue
114            }
115
116            if let Some(until) = until_date
117                && commit_time > until
118            {
119                continue;
120            }
121
122            // Extract commit info
123            let commit_info = self.extract_commit_info(&commit)?;
124            commits.push(commit_info);
125            count += 1;
126
127            if count % 50 == 0 {
128                tracing::debug!("Processed {} commits", count);
129            }
130        }
131
132        tracing::info!("Extracted {} new commits", commits.len());
133        Ok(commits)
134    }
135
136    /// Extract detailed information from a commit
137    fn extract_commit_info(&self, commit: &git2::Commit) -> Result<CommitInfo> {
138        let hash = format!("{}", commit.id());
139        let message = commit.message().unwrap_or("").to_string();
140        let author = commit.author();
141        let author_name = author.name().unwrap_or("Unknown").to_string();
142        let author_email = author.email().unwrap_or("").to_string();
143        let commit_date = commit.time().seconds();
144
145        // Extract parent hashes
146        let parent_hashes: Vec<String> = commit.parents().map(|p| format!("{}", p.id())).collect();
147
148        // Get diff and changed files
149        let (files_changed, diff_content) = self.extract_diff(commit)?;
150
151        Ok(CommitInfo {
152            hash,
153            message,
154            author_name,
155            author_email,
156            commit_date,
157            files_changed,
158            diff_content,
159            parent_hashes,
160        })
161    }
162
163    /// Extract diff and list of changed files
164    fn extract_diff(&self, commit: &git2::Commit) -> Result<(Vec<String>, String)> {
165        let mut files_changed = Vec::new();
166        let mut diff_content = String::new();
167        let mut diff_truncated = false;
168
169        let tree = commit.tree()?;
170
171        // Get parent tree (if exists)
172        let parent_tree = if commit.parent_count() > 0 {
173            Some(commit.parent(0)?.tree()?)
174        } else {
175            None
176        };
177
178        let mut diff_opts = DiffOptions::new();
179        diff_opts
180            .context_lines(3)
181            .interhunk_lines(0)
182            .ignore_whitespace(false);
183
184        let diff = if let Some(parent) = parent_tree {
185            self.repo
186                .diff_tree_to_tree(Some(&parent), Some(&tree), Some(&mut diff_opts))?
187        } else {
188            // First commit - diff against empty tree
189            self.repo
190                .diff_tree_to_tree(None, Some(&tree), Some(&mut diff_opts))?
191        };
192
193        // Iterate through deltas (file changes)
194        for delta in diff.deltas() {
195            if let Some(path) = delta.new_file().path() {
196                files_changed.push(path.display().to_string());
197            }
198        }
199
200        // Generate diff text
201        diff.print(git2::DiffFormat::Patch, |_delta, _hunk, line| {
202            // Stop adding content if already truncated (but continue processing - return true)
203            if diff_truncated {
204                return true;
205            }
206
207            // Skip binary files
208            if line.origin() == 'B' {
209                return true;
210            }
211
212            // Check if we're approaching the size limit before processing
213            if diff_content.len() >= 100_000 {
214                diff_truncated = true;
215                return true; // Continue processing, just stop adding content
216            }
217
218            // Build diff content string - only if valid UTF-8
219            let origin = line.origin();
220            if let Ok(content) = std::str::from_utf8(line.content()) {
221                match origin {
222                    '+' | '-' | ' ' => {
223                        diff_content.push(origin);
224                        diff_content.push_str(content);
225                    }
226                    'F' => {
227                        // File header
228                        diff_content.push_str("--- ");
229                        diff_content.push_str(content);
230                    }
231                    'H' => {
232                        // Hunk header
233                        diff_content.push_str(content);
234                    }
235                    _ => {}
236                }
237            } else {
238                // Invalid UTF-8 - skip this line but continue processing
239                tracing::debug!("Skipping diff line with invalid UTF-8");
240            }
241
242            // Always return true to continue processing (don't signal error to git2)
243            true
244        })?;
245
246        // Truncate if too large and add marker
247        if diff_content.len() > 8000 {
248            diff_content.truncate(8000);
249            diff_content.push_str("\n\n[... diff truncated ...]");
250            tracing::warn!("Truncated large diff for commit {}", commit.id());
251        }
252
253        Ok((files_changed, diff_content))
254    }
255
256    /// Check if repository has any commits
257    pub fn has_commits(&self) -> bool {
258        self.repo.head().is_ok()
259    }
260
261    /// Get total commit count (approximation)
262    pub fn estimate_commit_count(&self) -> Result<usize> {
263        let mut revwalk = self.repo.revwalk()?;
264        revwalk.push_head()?;
265        Ok(revwalk.count())
266    }
267}
268
#[cfg(test)]
mod tests {
    // These tests open the repository that contains the current working
    // directory, so they assume they run inside the project's own git checkout.
    use super::*;

    #[test]
    fn test_discover_current_repo() {
        let git = GitWalker::discover(".").expect("Should find git repo");

        assert!(git.repo_path().exists());
        assert!(git.has_commits());
    }

    #[test]
    fn test_current_branch() {
        let git = GitWalker::discover(".").expect("Should find git repo");

        assert!(
            git.current_branch().is_some(),
            "Should have a current branch"
        );
    }

    #[test]
    fn test_iter_commits_limited() {
        let git = GitWalker::discover(".").expect("Should find git repo");
        let nothing_skipped = HashSet::new();

        let batch = git
            .iter_commits(None, Some(5), None, None, &nothing_skipped)
            .expect("Should iterate commits");

        assert!(batch.len() <= 5, "Should respect max_count");

        // Every returned commit must carry at least a hash and an author name.
        for info in batch.iter() {
            assert!(!info.hash.is_empty(), "Commit hash should not be empty");
            assert!(
                !info.author_name.is_empty(),
                "Author name should not be empty"
            );
        }
    }

    #[test]
    fn test_commit_info_structure() {
        let git = GitWalker::discover(".").expect("Should find git repo");
        let nothing_skipped = HashSet::new();

        let batch = git
            .iter_commits(None, Some(1), None, None, &nothing_skipped)
            .expect("Should get commits");

        // Nothing to check when the walk returned no commit.
        let Some(info) = batch.first() else {
            return;
        };

        assert_eq!(info.hash.len(), 40, "Git SHA should be 40 chars");
        assert!(info.commit_date > 0, "Commit date should be positive");
    }

    #[test]
    fn test_skip_hashes() {
        let git = GitWalker::discover(".").expect("Should find git repo");
        let nothing_skipped = HashSet::new();

        // Fetch the first commit the walk yields.
        let initial = git
            .iter_commits(None, Some(1), None, None, &nothing_skipped)
            .expect("Should get commits");

        let Some(first_commit) = initial.first() else {
            return;
        };

        // Walk again with that commit excluded.
        let mut excluded = HashSet::new();
        excluded.insert(first_commit.hash.clone());

        let retry = git
            .iter_commits(None, Some(1), None, None, &excluded)
            .expect("Should get commits");

        // Either a different commit comes back, or none at all when the
        // history holds a single commit.
        if let Some(second_commit) = retry.first() {
            assert_ne!(
                first_commit.hash, second_commit.hash,
                "Should skip specified commit"
            );
        }
    }
}