git_indexer/
extraction.rs

//! Git repository extraction functionality.
//!
//! This module provides functions to extract information from git repositories,
//! including branches, tags, commits, and file diffs.
5
6use crate::error::{GitError, Result};
7use crate::models::{BranchInfo, ChangeType, CommitInfo, DiffHunk, FileChange, GitInfo, TagInfo};
8use gix::objs::tree::EntryKind;
9use gix::{Repository, discover};
10use imara_diff::{Algorithm, UnifiedDiffBuilder, diff};
11use std::collections::{HashMap, HashSet};
12use std::path::Path;
13
/// Maximum blob size, in bytes, for diff extraction (10 MiB).
///
/// Blobs larger than this are not diffed; `get_blob_diff` returns no hunks for them.
const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
16
17/// Extract git information from a repository.
18///
19/// # Arguments
20///
21/// * `repo_path` - Path to the git repository (or any subdirectory within it)
22///
23/// # Returns
24///
25/// Returns a `GitInfo` struct containing all branches and commits.
26///
27/// # Example
28///
29/// ```no_run
30/// use git_indexer::extraction::extract;
31/// use std::path::Path;
32///
33/// let git_info = extract(Path::new("/path/to/repo")).unwrap();
34/// println!("Found {} branches", git_info.branches.len());
35/// println!("Found {} commits", git_info.commits.len());
36/// ```
37pub fn extract(repo_path: &Path) -> Result<GitInfo> {
38    let repo = discover(repo_path).map_err(|e| GitError::RepositoryNotFound {
39        path: repo_path.to_path_buf(),
40        message: e.to_string(),
41    })?;
42
43    let repo: Repository = repo.into();
44    let branches = get_branches(&repo)?;
45    let tags = get_tags(&repo)?;
46    let commits = get_all_commits(&repo, &branches)?;
47
48    Ok(GitInfo { branches, tags, commits })
49}
50
51/// Organize git information into a JSON string grouped by branch.
52///
53/// # Arguments
54///
55/// * `git_info` - The extracted git information
56///
57/// # Returns
58///
59/// Returns a pretty-printed JSON string with commits organized by branch name.
60pub fn organize_info(git_info: &GitInfo) -> Result<String> {
61    // Create a map of commit_id -> commit for easy lookup
62    let commit_map: HashMap<&str, &CommitInfo> = git_info
63        .commits
64        .iter()
65        .map(|c| (c.id.as_str(), c))
66        .collect();
67
68    // Build structure: branch_name -> list of commits reachable from that branch
69    let mut branch_commits: HashMap<String, Vec<&CommitInfo>> = HashMap::new();
70
71    for branch in &git_info.branches {
72        let mut commits_for_branch = Vec::new();
73        let mut visited = HashSet::new();
74        let mut to_visit = Vec::new();
75
76        // Start from the branch's commit
77        to_visit.push(branch.commit_id.as_str());
78
79        // Traverse the commit graph
80        while let Some(commit_id) = to_visit.pop() {
81            if visited.contains(commit_id) {
82                continue;
83            }
84            visited.insert(commit_id);
85
86            if let Some(commit) = commit_map.get(commit_id) {
87                commits_for_branch.push(*commit);
88
89                // Add parents to visit
90                for parent_id in &commit.parent_ids {
91                    if !visited.contains(parent_id.as_str()) {
92                        to_visit.push(parent_id.as_str());
93                    }
94                }
95            }
96        }
97
98        // Sort commits by timestamp (newest first)
99        commits_for_branch.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
100
101        branch_commits.insert(branch.name.clone(), commits_for_branch);
102    }
103
104    // Convert to pretty-printed JSON string
105    Ok(serde_json::to_string_pretty(&branch_commits)?)
106}
107
108/// Get all branches (local and remote) from the repository.
109fn get_branches(repo: &Repository) -> Result<Vec<BranchInfo>> {
110    let mut branches = Vec::new();
111
112    let head = repo
113        .head()
114        .map_err(|e| GitError::ReferencesError(format!("Failed to get HEAD: {}", e)))?;
115    let head_name = head.referent_name().map(|n| n.as_bstr().to_string());
116
117    let refs = repo
118        .references()
119        .map_err(|e| GitError::ReferencesError(e.to_string()))?;
120
121    for reference in refs
122        .all()
123        .map_err(|e| GitError::ReferencesError(e.to_string()))?
124        .flatten()
125    {
126        let ref_name = reference.name().as_bstr().to_string();
127
128        // Local branches
129        if ref_name.starts_with("refs/heads/") {
130            let branch_name = ref_name.strip_prefix("refs/heads/").unwrap().to_string();
131            let is_head = head_name.as_ref().map_or(false, |h| h == &ref_name);
132            let commit_id = reference.id().to_hex().to_string();
133
134            branches.push(BranchInfo {
135                name: branch_name,
136                is_head,
137                commit_id,
138                is_remote: false,
139            });
140        }
141        // Remote branches
142        else if ref_name.starts_with("refs/remotes/") && !ref_name.ends_with("/HEAD") {
143            let branch_name = ref_name.strip_prefix("refs/remotes/").unwrap().to_string();
144            let commit_id = reference.id().to_hex().to_string();
145
146            branches.push(BranchInfo {
147                name: branch_name,
148                is_head: false,
149                commit_id,
150                is_remote: true,
151            });
152        }
153    }
154
155    Ok(branches)
156}
157
158/// Get all tags from the repository.
159fn get_tags(repo: &Repository) -> Result<Vec<TagInfo>> {
160    let mut tags = Vec::new();
161
162    let refs = repo
163        .references()
164        .map_err(|e| GitError::ReferencesError(e.to_string()))?;
165
166    for reference in refs
167        .prefixed("refs/tags/")
168        .map_err(|e| GitError::ReferencesError(e.to_string()))?
169        .flatten()
170    {
171        let ref_name = reference.name().as_bstr().to_string();
172        let tag_name = ref_name
173            .strip_prefix("refs/tags/")
174            .unwrap_or(&ref_name)
175            .to_string();
176
177        let target_id = reference.id().to_hex().to_string();
178
179        // Check if this is an annotated tag by trying to peel to a tag object
180        let (is_annotated, tagger, message) = match repo.find_object(reference.id().detach()) {
181            Ok(obj) => {
182                if let Ok(tag) = obj.try_into_tag() {
183                    if let Ok(decoded) = tag.decode() {
184                        let tagger_str = decoded.tagger.map(|sig| {
185                            format!("{} <{}>", sig.name, sig.email)
186                        });
187                        let msg = Some(decoded.message.to_string());
188                        (true, tagger_str, msg)
189                    } else {
190                        (true, None, None)
191                    }
192                } else {
193                    (false, None, None)
194                }
195            }
196            Err(_) => (false, None, None),
197        };
198
199        tags.push(TagInfo {
200            name: tag_name,
201            target_id,
202            is_annotated,
203            tagger,
204            message,
205        });
206    }
207
208    Ok(tags)
209}
210
211/// Collect all unique commits from all branches with their diffs.
212fn get_all_commits(repo: &Repository, branches: &[BranchInfo]) -> Result<Vec<CommitInfo>> {
213    let mut seen_commits: HashSet<String> = HashSet::new();
214    let mut commits: Vec<CommitInfo> = Vec::new();
215    let mut to_process: Vec<gix::ObjectId> = Vec::new();
216
217    // Collect all branch tips as starting points
218    for branch in branches {
219        if let Ok(oid) = gix::ObjectId::from_hex(branch.commit_id.as_bytes()) {
220            if !seen_commits.contains(&branch.commit_id) {
221                to_process.push(oid);
222            }
223        }
224    }
225
226    // Process commits in a breadth-first manner
227    while let Some(commit_id) = to_process.pop() {
228        let commit_id_str = commit_id.to_hex().to_string();
229
230        if seen_commits.contains(&commit_id_str) {
231            continue;
232        }
233        seen_commits.insert(commit_id_str.clone());
234
235        // Get commit object
236        let commit_obj = match repo.find_object(commit_id) {
237            Ok(obj) => obj,
238            Err(_) => continue,
239        };
240
241        let commit = match commit_obj.try_into_commit() {
242            Ok(c) => c,
243            Err(_) => continue,
244        };
245
246        let commit_decoded = match commit.decode() {
247            Ok(d) => d,
248            Err(_) => continue,
249        };
250
251        // Collect parent IDs
252        let parent_ids: Vec<String> = commit
253            .parent_ids()
254            .map(|id| id.to_hex().to_string())
255            .collect();
256
257        // Add parents to processing queue
258        for parent_id in commit.parent_ids() {
259            let parent_id_str = parent_id.to_hex().to_string();
260            if !seen_commits.contains(&parent_id_str) {
261                to_process.push(parent_id.detach());
262            }
263        }
264
265        // Extract file changes (diff against first parent, or empty for root commits)
266        let file_changes = get_commit_diff(repo, &commit, commit.parent_ids().next())?;
267
268        // tree sha  = dir structure
269        // Get tree SHA for this commit
270        let tree_id = match commit.tree() {
271            Ok(tree) => tree.id().to_hex().to_string(),
272            Err(_) => String::new(),
273        };
274
275        let message = commit_decoded.message();
276        let author = commit_decoded.author();
277
278        commits.push(CommitInfo {
279            id: commit_id_str,
280            tree_id,
281            message: message.title.to_string(),
282            author: format!("{} <{}>", author.name, author.email),
283            timestamp: author.seconds() as i64,
284            parent_ids,
285            file_changes,
286        });
287    }
288
289    // Sort by timestamp (newest first)
290    commits.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
291
292    Ok(commits)
293}
294
295/// Get file changes for a commit by diffing against its parent.
296fn get_commit_diff(
297    repo: &Repository,
298    commit: &gix::Commit,
299    parent_id: Option<gix::Id>,
300) -> Result<Vec<FileChange>> {
301    let mut file_changes = Vec::new();
302
303    // Get the commit's tree
304    let commit_tree = commit
305        .tree()
306        .map_err(|e| GitError::TreeReadError(e.to_string()))?;
307
308    // Get parent's tree (or use empty tree for root commits)
309    let parent_tree = if let Some(pid) = parent_id {
310        let parent_obj = repo
311            .find_object(pid.detach())
312            .map_err(|e| GitError::TreeReadError(e.to_string()))?;
313        let parent_commit = parent_obj
314            .try_into_commit()
315            .map_err(|e| GitError::TreeReadError(e.to_string()))?;
316        Some(
317            parent_commit
318                .tree()
319                .map_err(|e| GitError::TreeReadError(e.to_string()))?,
320        )
321    } else {
322        None
323    };
324
325    // Collect entries from both trees
326    let mut old_entries: HashMap<String, (gix::ObjectId, EntryKind)> = HashMap::new();
327    let mut new_entries: HashMap<String, (gix::ObjectId, EntryKind)> = HashMap::new();
328
329    // Recursively collect entries from parent tree
330    if let Some(ref tree) = parent_tree {
331        collect_tree_entries(repo, tree, String::new(), &mut old_entries)?;
332    }
333
334    // Recursively collect entries from commit tree
335    collect_tree_entries(repo, &commit_tree, String::new(), &mut new_entries)?;
336
337    // Find added and modified files
338    for (path, (new_oid, new_kind)) in &new_entries {
339        if *new_kind != EntryKind::Blob {
340            continue; // Skip non-blob entries (directories, submodules, etc.)
341        }
342
343        if let Some((old_oid, _)) = old_entries.get(path) {
344            // File exists in both trees
345            if old_oid != new_oid {
346                // Modified
347                let hunks = get_blob_diff(repo, Some(*old_oid), *new_oid)?;
348                file_changes.push(FileChange {
349                    path: path.clone(),
350                    old_path: None,
351                    change_type: ChangeType::Modified,
352                    old_blob_sha: Some(old_oid.to_hex().to_string()),
353                    new_blob_sha: Some(new_oid.to_hex().to_string()),
354                    hunks,
355                });
356            }
357            // else: unchanged, skip
358        } else {
359            // Added
360            let hunks = get_blob_diff(repo, None, *new_oid)?;
361            file_changes.push(FileChange {
362                path: path.clone(),
363                old_path: None,
364                change_type: ChangeType::Added,
365                old_blob_sha: None,
366                new_blob_sha: Some(new_oid.to_hex().to_string()),
367                hunks,
368            });
369        }
370    }
371
372    // Find deleted files
373    for (path, (old_oid, old_kind)) in &old_entries {
374        if *old_kind != EntryKind::Blob {
375            continue;
376        }
377
378        if !new_entries.contains_key(path) {
379            let hunks = get_blob_diff(
380                repo,
381                Some(*old_oid),
382                gix::ObjectId::empty_blob(gix::hash::Kind::Sha1),
383            )?;
384            file_changes.push(FileChange {
385                path: path.clone(),
386                old_path: None,
387                change_type: ChangeType::Deleted,
388                old_blob_sha: Some(old_oid.to_hex().to_string()),
389                new_blob_sha: None,
390                hunks,
391            });
392        }
393    }
394
395    Ok(file_changes)
396}
397
398/// Recursively collect all blob entries from a tree.
399fn collect_tree_entries(
400    repo: &Repository,
401    tree: &gix::Tree,
402    prefix: String,
403    entries: &mut HashMap<String, (gix::ObjectId, EntryKind)>,
404) -> Result<()> {
405    for entry in tree.iter() {
406        let entry = entry.map_err(|e| GitError::TreeReadError(e.to_string()))?;
407        let name = entry.filename().to_string();
408        let path = if prefix.is_empty() {
409            name.clone()
410        } else {
411            format!("{}/{}", prefix, name)
412        };
413
414        let mode = entry.mode();
415        let kind = mode.kind();
416        let oid = entry.oid();
417
418        match kind {
419            EntryKind::Blob | EntryKind::BlobExecutable => {
420                entries.insert(path, (oid.into(), kind));
421            }
422            EntryKind::Tree => {
423                // Recurse into subdirectory
424                if let Ok(subtree_obj) = repo.find_object(oid) {
425                    if let Ok(subtree) = subtree_obj.try_into_tree() {
426                        collect_tree_entries(repo, &subtree, path, entries)?;
427                    }
428                }
429            }
430            _ => {
431                // Skip links, submodules, etc.
432            }
433        }
434    }
435    Ok(())
436}
437
438/// Generate diff hunks between two blobs.
439fn get_blob_diff(
440    repo: &Repository,
441    old_oid: Option<gix::ObjectId>,
442    new_oid: gix::ObjectId,
443) -> Result<Vec<DiffHunk>> {
444    // Get old content
445    let old_content = if let Some(oid) = old_oid {
446        if oid == gix::ObjectId::empty_blob(gix::hash::Kind::Sha1) {
447            String::new()
448        } else {
449            match repo.find_object(oid) {
450                Ok(obj) => {
451                    let data = obj.data.clone();
452                    // Check size limit
453                    if data.len() as u64 > MAX_FILE_SIZE {
454                        return Ok(vec![]);
455                    }
456                    // Check if binary
457                    if is_binary(&data) {
458                        return Ok(vec![]);
459                    }
460                    String::from_utf8_lossy(&data).to_string()
461                }
462                Err(_) => String::new(),
463            }
464        }
465    } else {
466        String::new()
467    };
468
469    // Get new content
470    let new_content = if new_oid == gix::ObjectId::empty_blob(gix::hash::Kind::Sha1) {
471        String::new()
472    } else {
473        match repo.find_object(new_oid) {
474            Ok(obj) => {
475                let data = obj.data.clone();
476                // Check size limit
477                if data.len() as u64 > MAX_FILE_SIZE {
478                    return Ok(vec![]);
479                }
480                // Check if binary
481                if is_binary(&data) {
482                    return Ok(vec![]);
483                }
484                String::from_utf8_lossy(&data).to_string()
485            }
486            Err(_) => String::new(),
487        }
488    };
489
490    // Generate unified diff
491    let input = imara_diff::intern::InternedInput::new(old_content.as_str(), new_content.as_str());
492    let diff_output = diff(
493        Algorithm::Histogram,
494        &input,
495        UnifiedDiffBuilder::new(&input),
496    );
497
498    // Parse the unified diff to extract hunks
499    let hunks = parse_unified_diff(&diff_output);
500
501    Ok(hunks)
502}
503
/// Check if content is binary (contains null bytes in first 8KB).
///
/// Only the first 8192 bytes are inspected; empty input is not binary.
fn is_binary(data: &[u8]) -> bool {
    data.iter().take(8192).any(|&byte| byte == 0)
}
509
510/// Parse unified diff output into structured hunks.
511fn parse_unified_diff(diff_text: &str) -> Vec<DiffHunk> {
512    let mut hunks = Vec::new();
513    let mut current_hunk: Option<DiffHunk> = None;
514    let mut content_lines: Vec<String> = Vec::new();
515
516    for line in diff_text.lines() {
517        if line.starts_with("@@") {
518            // Save previous hunk
519            if let Some(mut hunk) = current_hunk.take() {
520                hunk.content = content_lines.join("\n");
521                hunks.push(hunk);
522                content_lines.clear();
523            }
524
525            // Parse hunk header: @@ -old_start,old_lines +new_start,new_lines @@
526            if let Some((old_start, old_lines, new_start, new_lines)) = parse_hunk_header(line) {
527                current_hunk = Some(DiffHunk {
528                    old_start,
529                    old_lines,
530                    new_start,
531                    new_lines,
532                    content: String::new(),
533                });
534            }
535        } else if current_hunk.is_some() {
536            // Content line (starts with +, -, or space)
537            content_lines.push(line.to_string());
538        }
539    }
540
541    // Save last hunk
542    if let Some(mut hunk) = current_hunk.take() {
543        hunk.content = content_lines.join("\n");
544        hunks.push(hunk);
545    }
546
547    hunks
548}
549
550/// Parse a hunk header line: @@ -old_start,old_lines +new_start,new_lines @@
551fn parse_hunk_header(line: &str) -> Option<(u32, u32, u32, u32)> {
552    let line = line.trim_start_matches("@@").trim_end_matches("@@").trim();
553    let parts: Vec<&str> = line.split_whitespace().collect();
554
555    if parts.len() < 2 {
556        return None;
557    }
558
559    let old_part = parts[0].trim_start_matches('-');
560    let new_part = parts[1].trim_start_matches('+');
561
562    let (old_start, old_lines) = parse_range(old_part)?;
563    let (new_start, new_lines) = parse_range(new_part)?;
564
565    Some((old_start, old_lines, new_start, new_lines))
566}
567
/// Parse a range like "10,5" or "10" into (start, lines).
///
/// A range without a comma has a line count of 1. Returns `None` when a
/// component is not a valid `u32`.
fn parse_range(s: &str) -> Option<(u32, u32)> {
    match s.split_once(',') {
        Some((start, lines)) => {
            let start: u32 = start.parse().ok()?;
            let lines: u32 = lines.parse().ok()?;
            Some((start, lines))
        }
        None => {
            let start: u32 = s.parse().ok()?;
            Some((start, 1))
        }
    }
}
573        Some((s.parse().ok()?, 1))
574    }
575}