project_rag/git/
chunker.rs

1use crate::git::walker::CommitInfo;
2use crate::indexer::CodeChunk;
3use crate::types::ChunkMetadata;
4use anyhow::Result;
5
/// Converts git commits into chunks suitable for embedding.
///
/// Derives `Debug` and `Clone` so the chunker can be logged and stored inside
/// other derivable types; it only holds a plain size limit.
#[derive(Debug, Clone)]
pub struct CommitChunker {
    /// Maximum content length (in bytes) before truncation
    max_content_length: usize,
}
11
12impl CommitChunker {
13    /// Create a new commit chunker with default settings
14    pub fn new() -> Self {
15        Self {
16            max_content_length: 6000, // ~1500 tokens for all-MiniLM-L6-v2
17        }
18    }
19
20    /// Create with custom max content length
21    pub fn with_max_length(max_content_length: usize) -> Self {
22        Self { max_content_length }
23    }
24
25    /// Convert a commit into a chunk for embedding
26    pub fn commit_to_chunk(
27        &self,
28        commit: &CommitInfo,
29        repo_path: &str,
30        project: Option<String>,
31    ) -> Result<CodeChunk> {
32        // Build searchable content: message + diff
33        let mut content = String::new();
34
35        // Add commit message
36        content.push_str("Commit Message:\n");
37        content.push_str(&commit.message);
38        content.push_str("\n\n");
39
40        // Add author info
41        content.push_str("Author: ");
42        content.push_str(&commit.author_name);
43        if !commit.author_email.is_empty() {
44            content.push_str(" <");
45            content.push_str(&commit.author_email);
46            content.push('>');
47        }
48        content.push_str("\n\n");
49
50        // Add files changed
51        if !commit.files_changed.is_empty() {
52            content.push_str("Files Changed:\n");
53            for file in &commit.files_changed {
54                content.push_str("- ");
55                content.push_str(file);
56                content.push('\n');
57            }
58            content.push('\n');
59        }
60
61        // Add diff content
62        if !commit.diff_content.is_empty() {
63            content.push_str("Diff:\n");
64            content.push_str(&commit.diff_content);
65        }
66
67        // Truncate if too long
68        if content.len() > self.max_content_length {
69            content.truncate(self.max_content_length);
70            content.push_str("\n\n[... content truncated for embedding ...]");
71        }
72
73        // Create chunk metadata
74        // Note: Git commits don't have line numbers, so we use 0
75        let metadata = ChunkMetadata {
76            file_path: format!("git://{}", repo_path),
77            root_path: None,
78            project,
79            start_line: 0,
80            end_line: 0,
81            language: Some("git-commit".to_string()),
82            extension: Some("commit".to_string()),
83            file_hash: commit.hash.clone(),
84            indexed_at: commit.commit_date,
85        };
86
87        Ok(CodeChunk { content, metadata })
88    }
89
90    /// Batch convert commits to chunks
91    pub fn commits_to_chunks(
92        &self,
93        commits: &[CommitInfo],
94        repo_path: &str,
95        project: Option<String>,
96    ) -> Result<Vec<CodeChunk>> {
97        commits
98            .iter()
99            .map(|commit| self.commit_to_chunk(commit, repo_path, project.clone()))
100            .collect()
101    }
102}
103
104impl Default for CommitChunker {
105    fn default() -> Self {
106        Self::new()
107    }
108}
109
#[cfg(test)]
mod tests {
    use super::*;

    /// Repository path handed to the chunker in every test.
    const REPO_PATH: &str = "/repo/path";

    /// Builds a fully populated commit fixture that individual tests tweak.
    fn sample_commit() -> CommitInfo {
        let message = String::from(
            "Fix authentication bug\n\nThis commit fixes a critical bug in the auth module.",
        );
        let files_changed = ["src/auth.rs", "tests/auth_tests.rs"]
            .iter()
            .map(|path| path.to_string())
            .collect();

        CommitInfo {
            hash: String::from("abc123def456"),
            message,
            author_name: String::from("John Doe"),
            author_email: String::from("john@example.com"),
            commit_date: 1_704_067_200, // 2024-01-01
            files_changed,
            diff_content: String::from("@@ -10,7 +10,7 @@\n-    old_line\n+    new_line\n"),
            parent_hashes: vec![String::from("parent123")],
        }
    }

    /// A default chunker carries hash, path and language into the metadata and
    /// message, author, file list and diff into the content.
    #[test]
    fn test_commit_to_chunk() {
        let fixture = sample_commit();
        let outcome = CommitChunker::new().commit_to_chunk(&fixture, REPO_PATH, None);
        let chunk = outcome.expect("Should convert commit to chunk");

        let meta = &chunk.metadata;
        assert_eq!(meta.file_path, "git:///repo/path");
        assert_eq!(meta.language.as_deref(), Some("git-commit"));
        assert_eq!(meta.file_hash, "abc123def456");

        let body = chunk.content.as_str();
        assert!(body.contains("Fix authentication bug"));
        assert!(body.contains("John Doe"));
        assert!(body.contains("src/auth.rs"));
        assert!(body.contains("new_line"));
    }

    /// Oversized content is cut down and tagged with the truncation notice.
    #[test]
    fn test_content_truncation() {
        let oversized = CommitInfo {
            diff_content: "x".repeat(10000), // Very large diff
            ..sample_commit()
        };

        let chunk = CommitChunker::with_max_length(100)
            .commit_to_chunk(&oversized, REPO_PATH, None)
            .expect("Should convert commit");

        // 100 + truncation message
        let length = chunk.content.len();
        assert!(length <= 150);
        assert!(chunk.content.contains("[... content truncated"));
    }

    /// Batch conversion keeps input order and passes the project to the chunks.
    #[test]
    fn test_commits_to_chunks_batch() {
        let second = CommitInfo {
            hash: String::from("different_hash"),
            ..sample_commit()
        };
        let batch = [sample_commit(), second];

        let chunks = CommitChunker::new()
            .commits_to_chunks(&batch, REPO_PATH, Some(String::from("my-project")))
            .expect("Should convert batch");

        assert_eq!(chunks.len(), 2);
        let (first, last) = (&chunks[0], &chunks[1]);
        assert_eq!(first.metadata.file_hash, "abc123def456");
        assert_eq!(last.metadata.file_hash, "different_hash");
        assert_eq!(first.metadata.project.as_deref(), Some("my-project"));
    }

    /// An empty email must not leave an empty `<>` pair in the content.
    #[test]
    fn test_empty_author_email() {
        let anonymous = CommitInfo {
            author_email: String::new(),
            ..sample_commit()
        };

        let chunk = CommitChunker::new()
            .commit_to_chunk(&anonymous, REPO_PATH, None)
            .expect("Should handle empty email");

        let body = chunk.content.as_str();
        assert!(body.contains("John Doe"));
        assert!(!body.contains("<>"));
    }

    /// With no changed files the "Files Changed:" section is left out.
    #[test]
    fn test_no_files_changed() {
        let untouched = CommitInfo {
            files_changed: Vec::new(),
            ..sample_commit()
        };

        let chunk = CommitChunker::new()
            .commit_to_chunk(&untouched, REPO_PATH, None)
            .expect("Should handle no files");

        assert!(!chunk.content.contains("Files Changed:"));
    }

    /// With an empty diff the "Diff:" section is left out.
    #[test]
    fn test_no_diff_content() {
        let diffless = CommitInfo {
            diff_content: String::new(),
            ..sample_commit()
        };

        let chunk = CommitChunker::new()
            .commit_to_chunk(&diffless, REPO_PATH, None)
            .expect("Should handle no diff");

        assert!(!chunk.content.contains("Diff:"));
    }
}