Skip to main content

enya_analyzer/
repo.rs

1//! Git repository operations for codebase integration.
2//!
3//! Handles cloning repositories and fetching updates using system git commands.
4
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::sync::Arc;
8use std::sync::atomic::{AtomicUsize, Ordering};
9
10use rayon::prelude::*;
11
/// Error raised by the git repository helpers in this module.
///
/// Wraps a single human-readable message; that message is exactly what
/// the `Display` implementation prints.
#[derive(Debug)]
pub struct RepoError(pub String);

impl std::fmt::Display for RepoError {
    /// Writes the wrapped message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for RepoError {}

impl From<std::io::Error> for RepoError {
    /// Converts an I/O error into a `RepoError` whose message is the I/O
    /// error's text prefixed with `IO error: `.
    fn from(io_err: std::io::Error) -> Self {
        let message = format!("IO error: {io_err}");
        RepoError(message)
    }
}
29
/// Computes the base directory under which cloned repositories live.
///
/// The location is `$HOME/.enya/repos`. Returns `None` when the `HOME`
/// environment variable is not set.
#[must_use]
pub fn repos_dir() -> Option<PathBuf> {
    let home = std::env::var_os("HOME")?;

    let mut dir = PathBuf::from(home);
    dir.push(".enya");
    dir.push("repos");
    Some(dir)
}
37
/// Extracts a repository name from a git URL.
///
/// The name is the last non-empty component of the URL after removing any
/// trailing `/` and a trailing `.git`. Both `/` and `:` are treated as
/// separators, so scp-style URLs without a slash (`host:repo.git`) work too.
///
/// Falls back to `"repo"` when no usable component exists (empty input) or
/// when the component is `.` or `..`, since those would not name a
/// sub-directory once joined onto a base path.
///
/// # Examples
///
/// ```
/// use enya_analyzer::repo::repo_name_from_url;
///
/// assert_eq!(repo_name_from_url("https://github.com/org/repo.git"), "repo");
/// assert_eq!(repo_name_from_url("git@github.com:org/repo.git"), "repo");
/// assert_eq!(repo_name_from_url("https://github.com/org/repo"), "repo");
/// assert_eq!(repo_name_from_url("git@example.com:repo.git"), "repo");
/// assert_eq!(repo_name_from_url("https://github.com/org/repo/"), "repo");
/// ```
#[must_use]
pub fn repo_name_from_url(url: &str) -> String {
    // Ignore trailing slashes so `https://host/org/repo/` still names `repo`.
    let trimmed = url.trim_end_matches('/');

    // Remove trailing .git if present
    let trimmed = trimmed.strip_suffix(".git").unwrap_or(trimmed);

    // `rsplit` always yields at least one item, so a separate "try ':' next"
    // fallback would never run; splitting on both separators at once handles
    // `host:repo` and `host:org/repo` uniformly.
    trimmed
        .rsplit(|c: char| c == '/' || c == ':')
        .find(|part| !part.is_empty())
        // `.` / `..` must never be used as a directory name under the repos dir.
        .filter(|part| !matches!(*part, "." | ".."))
        .unwrap_or("repo")
        .to_string()
}
61
62/// Clones a repository from the given URL.
63///
64/// The repository is cloned to `~/.enya/repos/<repo-name>/`.
65/// Returns the path to the cloned repository.
66///
67/// # Errors
68///
69/// Returns an error if cloning fails.
70pub fn clone_repo(url: &str) -> Result<PathBuf, RepoError> {
71    let Some(base_dir) = repos_dir() else {
72        return Err(RepoError("Could not determine home directory".to_string()));
73    };
74
75    // Ensure base directory exists
76    std::fs::create_dir_all(&base_dir)?;
77
78    let repo_name = repo_name_from_url(url);
79    let repo_path = base_dir.join(&repo_name);
80
81    // If repo already exists, ensure it has full history then return
82    if repo_path.exists() {
83        unshallow_if_needed(&repo_path)?;
84        return Ok(repo_path);
85    }
86
87    // Clone using git command (full history for commit indexing)
88    let output = Command::new("git")
89        .args(["clone", url])
90        .arg(&repo_path)
91        .output()
92        .map_err(|e| RepoError(format!("Failed to run git clone: {e}")))?;
93
94    if !output.status.success() {
95        let stderr = String::from_utf8_lossy(&output.stderr);
96        return Err(RepoError(format!("git clone failed: {stderr}")));
97    }
98
99    Ok(repo_path)
100}
101
102/// Fetches updates for an existing repository.
103///
104/// Returns `true` if there were remote changes.
105///
106/// # Errors
107///
108/// Returns an error if fetching fails.
109pub fn fetch_updates(repo_path: &Path) -> Result<bool, RepoError> {
110    // Get current HEAD
111    let head_before = get_head_commit(repo_path)?;
112
113    // Fetch from remote
114    let output = Command::new("git")
115        .args(["fetch", "origin"])
116        .current_dir(repo_path)
117        .output()
118        .map_err(|e| RepoError(format!("Failed to run git fetch: {e}")))?;
119
120    if !output.status.success() {
121        let stderr = String::from_utf8_lossy(&output.stderr);
122        return Err(RepoError(format!("git fetch failed: {stderr}")));
123    }
124
125    // Pull changes
126    let output = Command::new("git")
127        .args(["pull", "--ff-only"])
128        .current_dir(repo_path)
129        .output()
130        .map_err(|e| RepoError(format!("Failed to run git pull: {e}")))?;
131
132    if !output.status.success() {
133        let stderr = String::from_utf8_lossy(&output.stderr);
134        return Err(RepoError(format!("git pull failed: {stderr}")));
135    }
136
137    // Check if HEAD changed
138    let head_after = get_head_commit(repo_path)?;
139    Ok(head_before != head_after)
140}
141
142/// Converts a shallow clone to full history if needed.
143///
144/// This enables commit indexing on repositories that were previously cloned
145/// with `--depth 1`.
146fn unshallow_if_needed(repo_path: &Path) -> Result<(), RepoError> {
147    // Check if this is a shallow clone
148    let output = Command::new("git")
149        .args(["rev-parse", "--is-shallow-repository"])
150        .current_dir(repo_path)
151        .output()
152        .map_err(|e| RepoError(format!("Failed to check if shallow: {e}")))?;
153
154    let is_shallow = String::from_utf8_lossy(&output.stdout)
155        .trim()
156        .eq_ignore_ascii_case("true");
157
158    if !is_shallow {
159        return Ok(());
160    }
161
162    log::info!(
163        "Converting shallow clone to full history: {}",
164        repo_path.display()
165    );
166
167    // Fetch full history
168    let output = Command::new("git")
169        .args(["fetch", "--unshallow"])
170        .current_dir(repo_path)
171        .output()
172        .map_err(|e| RepoError(format!("Failed to unshallow repository: {e}")))?;
173
174    if !output.status.success() {
175        let stderr = String::from_utf8_lossy(&output.stderr);
176        return Err(RepoError(format!("git fetch --unshallow failed: {stderr}")));
177    }
178
179    log::info!("Successfully unshallowed repository");
180    Ok(())
181}
182
183/// Gets the current HEAD commit hash.
184///
185/// # Errors
186///
187/// Returns an error if the git command fails or the repository is invalid.
188pub fn get_head_commit(repo_path: &Path) -> Result<String, RepoError> {
189    let output = Command::new("git")
190        .args(["rev-parse", "HEAD"])
191        .current_dir(repo_path)
192        .output()
193        .map_err(|e| RepoError(format!("Failed to run git rev-parse: {e}")))?;
194
195    if !output.status.success() {
196        let stderr = String::from_utf8_lossy(&output.stderr);
197        return Err(RepoError(format!("git rev-parse failed: {stderr}")));
198    }
199
200    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
201}
202
203/// Gets the current HEAD commit message (subject line only).
204///
205/// # Errors
206///
207/// Returns an error if the git command fails or the repository is invalid.
208pub fn get_head_commit_message(repo_path: &Path) -> Result<String, RepoError> {
209    let output = Command::new("git")
210        .args(["log", "-1", "--format=%s"])
211        .current_dir(repo_path)
212        .output()
213        .map_err(|e| RepoError(format!("Failed to run git log: {e}")))?;
214
215    if !output.status.success() {
216        let stderr = String::from_utf8_lossy(&output.stderr);
217        return Err(RepoError(format!("git log failed: {stderr}")));
218    }
219
220    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
221}
222
223/// Information about a git commit.
224#[derive(Debug, Clone, PartialEq, Eq, Default)]
225pub struct CommitInfo {
226    /// Full git commit hash
227    pub hash: String,
228    /// Commit timestamp in Unix seconds
229    pub timestamp: i64,
230    /// Commit message (subject line)
231    pub message: String,
232    /// Files changed in this commit (relative paths)
233    pub files_changed: Vec<String>,
234    /// Raw diff content (truncated if too large)
235    pub diff: String,
236    /// Semantic information extracted from the diff
237    pub semantics: DiffSemantics,
238}
239
/// Summary of what a diff changed at the code level (extracted with Tree-sitter).
///
/// Every field is a list of names; an empty list means nothing of that kind
/// was detected.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DiffSemantics {
    /// Names of functions introduced by the diff.
    pub functions_added: Vec<String>,
    /// Names of functions deleted by the diff.
    pub functions_removed: Vec<String>,
    /// Names of functions whose bodies were changed.
    pub functions_modified: Vec<String>,
    /// Names of metrics added or modified (e.g., `counter.inc()`, `histogram.observe()`).
    pub metrics_added: Vec<String>,
    /// Names of metrics deleted by the diff.
    pub metrics_removed: Vec<String>,
    /// Import statements introduced by the diff.
    pub imports_added: Vec<String>,
    /// Import statements deleted by the diff.
    pub imports_removed: Vec<String>,
}
258
259/// Fetches commit history for a repository within a time range.
260///
261/// Returns commits between `start_secs` and `end_secs` (Unix timestamps).
262/// Commits are returned in reverse chronological order (newest first).
263///
264/// # Errors
265///
266/// Returns an error if the git command fails.
267pub fn fetch_commit_history(
268    repo_path: &Path,
269    start_secs: i64,
270    end_secs: i64,
271) -> Result<Vec<CommitInfo>, RepoError> {
272    let output = Command::new("git")
273        .args([
274            "log",
275            &format!("--after=@{start_secs}"),
276            &format!("--before=@{end_secs}"),
277            "--format=%H|%ct|%s",
278        ])
279        .current_dir(repo_path)
280        .output()
281        .map_err(|e| RepoError(format!("Failed to run git log: {e}")))?;
282
283    if !output.status.success() {
284        let stderr = String::from_utf8_lossy(&output.stderr);
285        return Err(RepoError(format!("git log failed: {stderr}")));
286    }
287
288    let stdout = String::from_utf8_lossy(&output.stdout);
289    parse_git_log_output(&stdout)
290}
291
292/// Counts the total number of commits in the repository.
293///
294/// This is a fast operation that doesn't fetch commit data.
295///
296/// # Errors
297///
298/// Returns an error if the git command fails.
299pub fn count_commits(repo_path: &Path) -> Result<usize, RepoError> {
300    let output = Command::new("git")
301        .args(["rev-list", "--count", "HEAD"])
302        .current_dir(repo_path)
303        .output()
304        .map_err(|e| RepoError(format!("Failed to run git rev-list --count: {e}")))?;
305
306    if !output.status.success() {
307        let stderr = String::from_utf8_lossy(&output.stderr);
308        return Err(RepoError(format!("git rev-list --count failed: {stderr}")));
309    }
310
311    let count_str = String::from_utf8_lossy(&output.stdout);
312    count_str
313        .trim()
314        .parse::<usize>()
315        .map_err(|e| RepoError(format!("Failed to parse commit count: {e}")))
316}
317
318/// Fetches all commits for indexing purposes.
319///
320/// Returns all commits in reverse chronological order (newest first).
321/// This is used for building the complete search index.
322/// Includes the list of files changed in each commit.
323///
324/// # Errors
325///
326/// Returns an error if the git command fails.
327pub fn fetch_all_commits(repo_path: &Path) -> Result<Vec<CommitInfo>, RepoError> {
328    // Use --name-only to get files changed, with a record separator to parse
329    // Format: hash|timestamp|message\n\nfile1\nfile2\n\n (commits separated by empty line)
330    let output = Command::new("git")
331        .args(["log", "--format=%H|%ct|%s", "--name-only"])
332        .current_dir(repo_path)
333        .output()
334        .map_err(|e| RepoError(format!("Failed to run git log: {e}")))?;
335
336    if !output.status.success() {
337        let stderr = String::from_utf8_lossy(&output.stderr);
338        return Err(RepoError(format!("git log failed: {stderr}")));
339    }
340
341    let stdout = String::from_utf8_lossy(&output.stdout);
342    Ok(parse_git_log_with_files(&stdout))
343}
344
345/// Fetches recent commits for indexing purposes.
346///
347/// Returns up to `limit` commits in reverse chronological order (newest first).
348/// This is used for building the search index, not for time-range queries.
349/// Includes the list of files changed in each commit.
350///
351/// # Errors
352///
353/// Returns an error if the git command fails.
354pub fn fetch_recent_commits(repo_path: &Path, limit: usize) -> Result<Vec<CommitInfo>, RepoError> {
355    // Use --name-only to get files changed, with a record separator to parse
356    // Format: hash|timestamp|message\n\nfile1\nfile2\n\n (commits separated by empty line)
357    let output = Command::new("git")
358        .args([
359            "log",
360            &format!("-{limit}"),
361            "--format=%H|%ct|%s",
362            "--name-only",
363        ])
364        .current_dir(repo_path)
365        .output()
366        .map_err(|e| RepoError(format!("Failed to run git log: {e}")))?;
367
368    if !output.status.success() {
369        let stderr = String::from_utf8_lossy(&output.stderr);
370        return Err(RepoError(format!("git log failed: {stderr}")));
371    }
372
373    let stdout = String::from_utf8_lossy(&output.stdout);
374    Ok(parse_git_log_with_files(&stdout))
375}
376
377/// Maximum diff size to store per commit (64KB).
378/// Larger diffs are truncated to avoid bloating the index.
379const MAX_DIFF_SIZE: usize = 64 * 1024;
380
381/// Fetches the diff for a single commit.
382///
383/// Returns the unified diff output for the commit. Large diffs are truncated
384/// to `MAX_DIFF_SIZE` bytes.
385///
386/// # Errors
387///
388/// Returns an error if the git command fails.
389pub fn fetch_commit_diff(repo_path: &Path, commit_hash: &str) -> Result<String, RepoError> {
390    let output = Command::new("git")
391        .args([
392            "show",
393            commit_hash,
394            "--format=",   // Skip the commit message header
395            "--unified=3", // 3 lines of context
396            "-p",          // Show patch
397        ])
398        .current_dir(repo_path)
399        .output()
400        .map_err(|e| RepoError(format!("Failed to run git show: {e}")))?;
401
402    if !output.status.success() {
403        let stderr = String::from_utf8_lossy(&output.stderr);
404        return Err(RepoError(format!("git show failed: {stderr}")));
405    }
406
407    let diff = String::from_utf8_lossy(&output.stdout);
408
409    // Truncate if too large
410    if diff.len() > MAX_DIFF_SIZE {
411        Ok(format!(
412            "{}\n\n[... diff truncated, {} bytes total ...]",
413            &diff[..MAX_DIFF_SIZE],
414            diff.len()
415        ))
416    } else {
417        Ok(diff.into_owned())
418    }
419}
420
421/// Fetches recent commits with their diffs for full indexing.
422///
423/// This is more expensive than `fetch_recent_commits` as it fetches the
424/// full diff for each commit. Use sparingly for indexing purposes.
425///
426/// # Errors
427///
428/// Returns an error if the git command fails.
429pub fn fetch_recent_commits_with_diffs(
430    repo_path: &Path,
431    limit: usize,
432) -> Result<Vec<CommitInfo>, RepoError> {
433    // First get the basic commit info
434    let mut commits = fetch_recent_commits(repo_path, limit)?;
435
436    // Then fetch diffs for each commit and extract semantics
437    for commit in &mut commits {
438        match fetch_commit_diff(repo_path, &commit.hash) {
439            Ok(diff) => {
440                // Extract semantic information from the diff
441                commit.semantics = crate::diff::extract_semantics(&diff);
442                commit.diff = diff;
443            }
444            Err(e) => {
445                log::warn!("Failed to fetch diff for {}: {e}", &commit.hash[..8]);
446                // Continue without diff - better to have partial data
447            }
448        }
449    }
450
451    Ok(commits)
452}
453
/// Callback type used to report progress while commits are being processed.
///
/// The callback receives, in order:
/// - `current`: how many commits have been processed so far
/// - `total`: how many commits will be processed overall
/// - `current_item`: an optional short description of a recently processed commit
///
/// It must be `Send + Sync` because it is invoked from parallel worker threads.
pub type ProgressCallback = Box<dyn Fn(usize, usize, Option<&str>) + Send + Sync>;
461
462/// Unique delimiter used to separate commits in batch git log output.
463const COMMIT_DELIMITER: &str = "\n__ENYA_COMMIT_BOUNDARY__\n";
464
465/// Fetches all commits with their diffs in a single git command (batch mode).
466///
467/// This is significantly faster than fetching diffs individually because it
468/// avoids the overhead of spawning many git processes. Uses a single `git log -p`
469/// command to get all commits and diffs, then parses the output.
470///
471/// **Note:** This function skips merge commits (`--no-merges`) and only follows
472/// the main branch history (`--first-parent`), excluding feature branch commits.
473/// This reduces redundant diff processing and focuses on the mainline history.
474///
475/// Semantic extraction is still parallelized with rayon for CPU efficiency.
476///
477/// # Arguments
478///
479/// * `repo_path` - Path to the git repository
480/// * `since_commit` - If provided, only fetch commits after this SHA (for incremental indexing)
481/// * `progress` - Optional progress callback for UI updates
482///
483/// # Returns
484///
485/// A vector of `CommitInfo` with diffs and semantics populated.
486///
487/// # Errors
488///
489/// Returns an error if the git command fails.
490pub fn fetch_all_commits_with_diffs_batch(
491    repo_path: &Path,
492    since_commit: Option<&str>,
493    progress: Option<&ProgressCallback>,
494) -> Result<Vec<CommitInfo>, RepoError> {
495    // Build the git log command with patches
496    // Format: delimiter + hash|timestamp|subject + newline + diff
497    let format_arg = format!("{}%H|%ct|%s", COMMIT_DELIMITER.trim_start());
498
499    let mut args = vec![
500        "log".to_string(),
501        format!("--format={format_arg}"),
502        "-p".to_string(),             // Include patches (diffs)
503        "--unified=3".to_string(),    // 3 lines of context
504        "--no-merges".to_string(),    // Skip merge commits (large, redundant diffs)
505        "--first-parent".to_string(), // Follow only main branch, skip feature branch commits
506    ];
507    // NOTE: Do NOT use --name-only with -p, it overrides patch output!
508
509    // For incremental indexing: only get commits since the last indexed one
510    if let Some(since) = since_commit {
511        args.push(format!("{since}..HEAD"));
512    }
513
514    log::info!(
515        "Fetching commits with diffs in batch mode from: {}{}",
516        repo_path.display(),
517        since_commit.map_or(String::new(), |s| format!(
518            " (since {})",
519            &s[..7.min(s.len())]
520        ))
521    );
522
523    let output = Command::new("git")
524        .args(&args)
525        .current_dir(repo_path)
526        .output()
527        .map_err(|e| RepoError(format!("Failed to run git log -p: {e}")))?;
528
529    if !output.status.success() {
530        let stderr = String::from_utf8_lossy(&output.stderr);
531        return Err(RepoError(format!("git log -p failed: {stderr}")));
532    }
533
534    let stdout = String::from_utf8_lossy(&output.stdout);
535
536    // Parse the batch output into commits with diffs
537    let mut commits = parse_batch_log_output(&stdout);
538    let total = commits.len();
539
540    if total == 0 {
541        log::info!("No commits to process");
542        return Ok(commits);
543    }
544
545    log::info!("Parsing diffs and extracting semantics for {total} commits");
546
547    // Phase 2: Extract semantics in parallel (CPU-bound, benefits from parallelization)
548    let processed = Arc::new(AtomicUsize::new(0));
549
550    commits.par_iter_mut().for_each(|commit| {
551        // Extract semantic information from the diff
552        commit.semantics = crate::diff::extract_semantics(&commit.diff);
553
554        // Truncate large diffs after semantic extraction
555        if commit.diff.len() > MAX_DIFF_SIZE {
556            let truncated_diff = format!(
557                "{}\n\n[... diff truncated, {} bytes total ...]",
558                &commit.diff[..MAX_DIFF_SIZE],
559                commit.diff.len()
560            );
561            commit.diff = truncated_diff;
562        }
563
564        // Update progress atomically
565        let count = processed.fetch_add(1, Ordering::Relaxed) + 1;
566
567        // Report progress periodically
568        if let Some(ref callback) = progress {
569            if count % 50 == 0 || count == total {
570                let short_hash = &commit.hash[..7.min(commit.hash.len())];
571                let first_line = commit.message.lines().next().unwrap_or("");
572                // Use char_indices to find a safe truncation boundary (Unicode-safe)
573                let truncated = if first_line.chars().count() > 35 {
574                    let boundary = first_line
575                        .char_indices()
576                        .nth(32)
577                        .map_or(first_line.len(), |(i, _)| i);
578                    format!("{}...", &first_line[..boundary])
579                } else {
580                    first_line.to_string()
581                };
582                let item_desc = format!("{short_hash} {truncated}");
583                callback(count, total, Some(&item_desc));
584            }
585        }
586    });
587
588    log::info!(
589        "Completed batch diff fetching for {} commits",
590        commits.len()
591    );
592
593    Ok(commits)
594}
595
596/// Parses the output of `git log -p` with our custom format.
597///
598/// The format uses `COMMIT_DELIMITER` to separate commits, with each commit
599/// having: hash|timestamp|subject followed by the diff.
600fn parse_batch_log_output(output: &str) -> Vec<CommitInfo> {
601    let mut commits = Vec::new();
602
603    // Split by our delimiter
604    for section in output.split(COMMIT_DELIMITER.trim()) {
605        let section = section.trim();
606        if section.is_empty() {
607            continue;
608        }
609
610        // First line is: hash|timestamp|subject
611        let mut lines = section.lines();
612        let Some(header) = lines.next() else {
613            continue;
614        };
615
616        // Parse header: hash|timestamp|subject
617        let mut parts = header.splitn(3, '|');
618        let Some(hash) = parts.next() else { continue };
619        let Some(timestamp_str) = parts.next() else {
620            continue;
621        };
622        let message = parts.next().unwrap_or("");
623
624        // Validate hash (should be 40 hex chars)
625        let hash = hash.trim();
626        if hash.len() < 40 || !hash.chars().all(|c| c.is_ascii_hexdigit()) {
627            continue;
628        }
629
630        let timestamp = timestamp_str.trim().parse::<i64>().unwrap_or(0);
631
632        // Rest is the diff (and file list from --name-only)
633        let diff: String = lines.collect::<Vec<_>>().join("\n");
634
635        // Extract files changed from the diff header lines
636        let files_changed = extract_files_from_diff(&diff);
637
638        commits.push(CommitInfo {
639            hash: hash.to_string(),
640            timestamp,
641            message: message.to_string(),
642            files_changed,
643            diff,
644            semantics: DiffSemantics::default(),
645        });
646    }
647
648    commits
649}
650
/// Extracts file paths from a diff's header lines.
///
/// Scans for `diff --git a/<old> b/<new>` lines and returns each `<old>` path
/// once, in order of first appearance. Lines that do not start with
/// `diff --git a/` are ignored.
fn extract_files_from_diff(diff: &str) -> Vec<String> {
    let mut files: Vec<String> = Vec::new();

    for line in diff.lines() {
        let Some(rest) = line.strip_prefix("diff --git a/") else {
            continue;
        };
        let Some(path) = old_path_from_diff_header(rest) else {
            continue;
        };
        // Compare as `&str` so no temporary `String` is built per header line
        if !files.iter().any(|seen| seen == path) {
            files.push(path.to_string());
        }
    }

    files
}

/// Returns the `a/` side path from the text following `diff --git a/`.
///
/// `rest` has the shape `<old> b/<new>`. When the file was not renamed both
/// halves are identical, so the separator sits exactly in the middle; checking
/// that first keeps paths that themselves contain `" b/"` intact. Otherwise
/// the first `" b/"` is taken as the separator.
fn old_path_from_diff_header(rest: &str) -> Option<&str> {
    if let Some(payload) = rest.len().checked_sub(3) {
        if payload % 2 == 0 {
            let mid = payload / 2;
            // `get` yields `None` instead of panicking if an offset is not a char boundary
            if let (Some(old), Some(" b/"), Some(new)) =
                (rest.get(..mid), rest.get(mid..mid + 3), rest.get(mid + 3..))
            {
                if old == new {
                    return Some(old);
                }
            }
        }
    }

    rest.find(" b/").map(|idx| &rest[..idx])
}
670
671/// Fetches all commits with their diffs in parallel for full indexing.
672///
673/// This function fetches the complete git history and processes diffs
674/// in parallel using rayon for significant performance improvements.
675///
676/// **Note:** Consider using `fetch_all_commits_with_diffs_batch` instead,
677/// which is faster for large repositories as it uses a single git command.
678///
679/// # Arguments
680///
681/// * `repo_path` - Path to the git repository
682/// * `progress` - Optional progress callback for UI updates
683///
684/// # Returns
685///
686/// A vector of `CommitInfo` with diffs and semantics populated.
687///
688/// # Errors
689///
690/// Returns an error if fetching commit metadata fails.
691/// Individual diff fetch failures are logged but don't fail the operation.
692#[deprecated(
693    since = "0.1.0",
694    note = "Use fetch_all_commits_with_diffs_batch for better performance"
695)]
696pub fn fetch_all_commits_with_diffs_parallel(
697    repo_path: &Path,
698    progress: Option<&ProgressCallback>,
699) -> Result<Vec<CommitInfo>, RepoError> {
700    // Phase 1: Fetch commit metadata (fast - single git command)
701    let mut commits = fetch_all_commits(repo_path)?;
702
703    if commits.is_empty() {
704        return Ok(commits);
705    }
706
707    let total = commits.len();
708    log::info!(
709        "Fetching diffs for {} commits in parallel from: {}",
710        total,
711        repo_path.display()
712    );
713
714    // Atomic counter for progress tracking across threads
715    let processed = Arc::new(AtomicUsize::new(0));
716
717    // Phase 2: Fetch diffs and extract semantics in parallel
718    let repo_path = repo_path.to_path_buf();
719    commits.par_iter_mut().for_each(|commit| {
720        // Fetch diff for this commit
721        match fetch_commit_diff(&repo_path, &commit.hash) {
722            Ok(diff) => {
723                commit.semantics = crate::diff::extract_semantics(&diff);
724                commit.diff = diff;
725            }
726            Err(e) => {
727                log::warn!("Failed to fetch diff for {}: {e}", &commit.hash[..8]);
728            }
729        }
730
731        // Update progress atomically
732        let count = processed.fetch_add(1, Ordering::Relaxed) + 1;
733
734        // Report progress periodically (every 50 commits) to reduce callback overhead
735        if let Some(ref callback) = progress {
736            if count % 50 == 0 || count == total {
737                let short_hash = &commit.hash[..7.min(commit.hash.len())];
738                let first_line = commit.message.lines().next().unwrap_or("");
739                // Use char_indices to find a safe truncation boundary (Unicode-safe)
740                let truncated = if first_line.chars().count() > 35 {
741                    let boundary = first_line
742                        .char_indices()
743                        .nth(32)
744                        .map_or(first_line.len(), |(i, _)| i);
745                    format!("{}...", &first_line[..boundary])
746                } else {
747                    first_line.to_string()
748                };
749                let item_desc = format!("{short_hash} {truncated}");
750                callback(count, total, Some(&item_desc));
751            }
752        }
753    });
754
755    log::info!(
756        "Completed parallel diff fetching for {} commits",
757        commits.len()
758    );
759
760    Ok(commits)
761}
762
763/// Parses git log output in the format `hash|timestamp|message` (no files).
764fn parse_git_log_output(output: &str) -> Result<Vec<CommitInfo>, RepoError> {
765    let mut commits = Vec::new();
766
767    for line in output.lines() {
768        let line = line.trim();
769        if line.is_empty() {
770            continue;
771        }
772
773        // Split on first two pipes only (message may contain pipes)
774        let mut parts = line.splitn(3, '|');
775
776        let hash = parts
777            .next()
778            .ok_or_else(|| RepoError("Missing hash in git log output".to_string()))?
779            .to_string();
780
781        let timestamp_str = parts
782            .next()
783            .ok_or_else(|| RepoError("Missing timestamp in git log output".to_string()))?;
784
785        let timestamp = timestamp_str
786            .parse::<i64>()
787            .map_err(|e| RepoError(format!("Invalid timestamp '{timestamp_str}': {e}")))?;
788
789        let message = parts.next().unwrap_or("").to_string();
790
791        commits.push(CommitInfo {
792            hash,
793            timestamp,
794            message,
795            ..Default::default()
796        });
797    }
798
799    Ok(commits)
800}
801
802/// Parses git log output with `--name-only` format.
803///
804/// Format: Each commit starts with `hash|timestamp|message` followed by
805/// a blank line, then the list of files (one per line), then another blank line.
806fn parse_git_log_with_files(output: &str) -> Vec<CommitInfo> {
807    let mut commits = Vec::new();
808    let mut current_commit: Option<CommitInfo> = None;
809
810    for line in output.lines() {
811        let line = line.trim();
812
813        // Check if this line is a commit header (contains hash|timestamp|message)
814        if line.contains('|') && line.len() >= 40 {
815            // This looks like a commit header line
816            let mut parts = line.splitn(3, '|');
817
818            let hash = parts.next().unwrap_or("");
819            let timestamp_str = parts.next().unwrap_or("");
820            let message = parts.next().unwrap_or("");
821
822            // Validate it looks like a hash (40 hex chars)
823            if hash.len() >= 40 && hash.chars().all(|c| c.is_ascii_hexdigit()) {
824                // Save previous commit if any
825                if let Some(commit) = current_commit.take() {
826                    commits.push(commit);
827                }
828
829                let timestamp = timestamp_str.parse::<i64>().unwrap_or(0);
830
831                current_commit = Some(CommitInfo {
832                    hash: hash.to_string(),
833                    timestamp,
834                    message: message.to_string(),
835                    ..Default::default()
836                });
837                continue;
838            }
839        }
840
841        // Empty line or non-header line
842        if line.is_empty() {
843            continue;
844        }
845
846        // This is a file path - add to current commit
847        if let Some(ref mut commit) = current_commit {
848            commit.files_changed.push(line.to_string());
849        }
850    }
851
852    // Don't forget the last commit
853    if let Some(commit) = current_commit {
854        commits.push(commit);
855    }
856
857    commits
858}
859
/// Unit tests for URL handling and the git-log / diff parsers in this module.
///
/// Multi-line fixtures are assembled with `concat!` and explicit `\n` so their
/// exact bytes do not depend on how the surrounding code is indented.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the fixed `CommitInfo` fixture shared by the equality and clone tests.
    fn sample_commit() -> CommitInfo {
        CommitInfo {
            hash: "abc".to_string(),
            timestamp: 1000,
            message: "test".to_string(),
            files_changed: vec!["file.rs".to_string()],
            ..Default::default()
        }
    }

    /// The repository name is the last path component with any `.git` suffix removed.
    #[test]
    fn test_repo_name_from_url() {
        let cases = [
            ("https://github.com/org/repo.git", "repo"),
            ("https://github.com/org/repo", "repo"),
            ("git@github.com:org/my-repo.git", "my-repo"),
            ("https://gitlab.com/group/subgroup/project.git", "project"),
        ];

        for (url, expected) in cases {
            assert_eq!(repo_name_from_url(url), expected, "url: {url}");
        }
    }

    /// A single `hash|timestamp|message` line yields exactly one commit.
    #[test]
    fn test_parse_git_log_output_single_commit() {
        let commits = parse_git_log_output("abc123def456|1700000000|Initial commit\n")
            .expect("should parse");

        assert_eq!(commits.len(), 1);
        let commit = &commits[0];
        assert_eq!(commit.hash, "abc123def456");
        assert_eq!(commit.timestamp, 1_700_000_000);
        assert_eq!(commit.message, "Initial commit");
    }

    /// Several header lines yield one commit each, in input order.
    #[test]
    fn test_parse_git_log_output_multiple_commits() {
        let output = concat!(
            "abc123|1700000000|First commit\n",
            "def456|1700001000|Second commit\n",
            "ghi789|1700002000|Third commit\n",
        );
        let commits = parse_git_log_output(output).expect("should parse");

        assert_eq!(commits.len(), 3);
        assert_eq!(commits[0].hash, "abc123");
        assert_eq!(commits[1].hash, "def456");
        assert_eq!(commits[2].hash, "ghi789");
    }

    /// Only the first two `|` characters are separators; later ones belong to the message.
    #[test]
    fn test_parse_git_log_output_message_with_pipes() {
        let commits = parse_git_log_output("abc123|1700000000|Fix bug | add feature | cleanup\n")
            .expect("should parse");

        assert_eq!(commits.len(), 1);
        assert_eq!(commits[0].message, "Fix bug | add feature | cleanup");
    }

    /// Empty input parses successfully to an empty list.
    #[test]
    fn test_parse_git_log_output_empty() {
        let commits = parse_git_log_output("").expect("should parse");
        assert!(commits.is_empty());
    }

    /// Input consisting solely of whitespace parses successfully to an empty list.
    #[test]
    fn test_parse_git_log_output_whitespace_only() {
        let commits = parse_git_log_output("  \n  \n  ").expect("should parse");
        assert!(commits.is_empty());
    }

    /// A header with nothing after the second `|` yields an empty message.
    #[test]
    fn test_parse_git_log_output_empty_message() {
        let commits = parse_git_log_output("abc123|1700000000|\n").expect("should parse");

        assert_eq!(commits.len(), 1);
        assert_eq!(commits[0].message, "");
    }

    /// Independently built commits with identical fields compare equal, and a
    /// differing field makes them unequal (guards against a vacuous `PartialEq`).
    #[test]
    fn test_commit_info_equality() {
        let c1 = sample_commit();
        let c2 = sample_commit();
        assert_eq!(c1, c2);

        let different = CommitInfo {
            hash: "def".to_string(),
            ..sample_commit()
        };
        assert_ne!(c1, different);
    }

    /// A clone compares equal to the value it was cloned from.
    #[test]
    fn test_commit_info_clone() {
        let c1 = sample_commit();
        let c2 = c1.clone();
        assert_eq!(c1, c2);
    }

    /// File paths listed after a header line are attached to that commit.
    #[test]
    fn test_parse_git_log_with_files() {
        // Full-length (40 hex character) hashes, as git prints them.
        let output = concat!(
            "abc123def456789012345678901234567890abcd|1700000000|Add executor\n",
            "\n",
            "src/executor.rs\n",
            "src/lib.rs\n",
            "\n",
            "def456789012345678901234567890abcdef12ab|1700001000|Fix bug\n",
            "\n",
            "src/main.rs\n",
        );
        let commits = parse_git_log_with_files(output);

        assert_eq!(commits.len(), 2);

        let first = &commits[0];
        assert_eq!(first.hash, "abc123def456789012345678901234567890abcd");
        assert_eq!(first.message, "Add executor");
        assert_eq!(first.files_changed, vec!["src/executor.rs", "src/lib.rs"]);

        let second = &commits[1];
        assert_eq!(second.hash, "def456789012345678901234567890abcdef12ab");
        assert_eq!(second.message, "Fix bug");
        assert_eq!(second.files_changed, vec!["src/main.rs"]);
    }

    /// Empty input yields no commits.
    #[test]
    fn test_parse_git_log_with_files_empty() {
        assert!(parse_git_log_with_files("").is_empty());
    }

    /// A header line with no following paths yields a commit with no changed files.
    #[test]
    fn test_parse_git_log_with_files_no_files() {
        let output = "abc123def456789012345678901234567890abcd|1700000000|Empty commit\n";
        let commits = parse_git_log_with_files(output);

        assert_eq!(commits.len(), 1);
        assert!(commits[0].files_changed.is_empty());
    }

    /// Regression test: batch parsing must keep the actual patch text, not just
    /// file names. `--name-only` conflicts with `-p`; if both were passed to
    /// git the diffs would come back empty.
    #[test]
    fn test_parse_batch_log_output_with_diff_content() {
        let output = concat!(
            "__ENYA_COMMIT_BOUNDARY__\n",
            "abc123def456789012345678901234567890abcd|1700000000|Add new feature\n",
            "\n",
            "diff --git a/src/main.rs b/src/main.rs\n",
            "index 1234567..abcdefg 100644\n",
            "--- a/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,5 @@\n",
            " fn main() {\n",
            "+    println!(\"Hello, world!\");\n",
            "+    do_something();\n",
            " }\n",
            "diff --git a/src/lib.rs b/src/lib.rs\n",
            "index 2345678..bcdefgh 100644\n",
            "--- a/src/lib.rs\n",
            "+++ b/src/lib.rs\n",
            "@@ -1 +1,3 @@\n",
            "+pub fn do_something() {\n",
            "+}\n",
        );
        let commits = parse_batch_log_output(output);

        assert_eq!(commits.len(), 1);
        let commit = &commits[0];

        // Header metadata.
        assert_eq!(commit.hash, "abc123def456789012345678901234567890abcd");
        assert_eq!(commit.timestamp, 1_700_000_000);
        assert_eq!(commit.message, "Add new feature");

        // The heart of the regression test: real patch lines must be present.
        // The preview is cut by characters rather than by byte index so that
        // building the failure message can never panic on a char boundary.
        assert!(
            commit.diff.contains(r#"println!("Hello, world!");"#),
            "Diff should contain actual code changes, not just file names. Got: {}",
            commit.diff.chars().take(200).collect::<String>()
        );
        assert!(
            commit.diff.contains("pub fn do_something()"),
            "Diff should contain function definition"
        );
        assert!(
            commit.diff.contains("@@ -1,3 +1,5 @@"),
            "Diff should contain hunk headers"
        );

        // Changed files are derived from the `diff --git` headers.
        assert_eq!(commit.files_changed.len(), 2);
        assert!(commit.files_changed.iter().any(|f| f == "src/main.rs"));
        assert!(commit.files_changed.iter().any(|f| f == "src/lib.rs"));
    }

    /// Each boundary marker starts a new commit that carries its own diff.
    #[test]
    fn test_parse_batch_log_output_multiple_commits() {
        let output = concat!(
            "__ENYA_COMMIT_BOUNDARY__\n",
            "abc123def456789012345678901234567890abcd|1700000000|First commit\n",
            "\n",
            "diff --git a/file1.rs b/file1.rs\n",
            "--- a/file1.rs\n",
            "+++ b/file1.rs\n",
            "@@ -1 +1,2 @@\n",
            "+// added line\n",
            "__ENYA_COMMIT_BOUNDARY__\n",
            "def456789012345678901234567890abcdef1234|1700001000|Second commit\n",
            "\n",
            "diff --git a/file2.rs b/file2.rs\n",
            "--- a/file2.rs\n",
            "+++ b/file2.rs\n",
            "@@ -1 +1,2 @@\n",
            "+// another line\n",
        );
        let commits = parse_batch_log_output(output);

        assert_eq!(commits.len(), 2);

        let first = &commits[0];
        assert_eq!(first.hash, "abc123def456789012345678901234567890abcd");
        assert_eq!(first.message, "First commit");
        assert!(first.diff.contains("// added line"));

        let second = &commits[1];
        assert_eq!(second.hash, "def456789012345678901234567890abcdef1234");
        assert_eq!(second.message, "Second commit");
        assert!(second.diff.contains("// another line"));
    }

    /// Empty input yields no commits.
    #[test]
    fn test_parse_batch_log_output_empty() {
        assert!(parse_batch_log_output("").is_empty());
    }

    /// One path is reported per `diff --git` header, in order of appearance.
    #[test]
    fn test_extract_files_from_diff() {
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "index 1234567..abcdefg 100644\n",
            "--- a/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,5 @@\n",
            " fn main() {}\n",
            "diff --git a/src/lib.rs b/src/lib.rs\n",
            "--- a/src/lib.rs\n",
            "+++ b/src/lib.rs\n",
            "@@ -1 +1,2 @@\n",
            "+pub fn foo() {}\n",
        );
        let files = extract_files_from_diff(diff);

        assert_eq!(files.len(), 2);
        assert_eq!(files[0], "src/main.rs");
        assert_eq!(files[1], "src/lib.rs");
    }

    /// A file whose `diff --git` header appears more than once is reported once.
    #[test]
    fn test_extract_files_from_diff_no_duplicates() {
        // Two separate sections for the same path: without de-duplication the
        // result would contain `src/main.rs` twice.
        let diff = concat!(
            "diff --git a/src/main.rs b/src/main.rs\n",
            "--- a/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -1,3 +1,5 @@\n",
            " fn main() {}\n",
            "diff --git a/src/main.rs b/src/main.rs\n",
            "--- a/src/main.rs\n",
            "+++ b/src/main.rs\n",
            "@@ -10,2 +12,3 @@\n",
            "+fn helper() {}\n",
        );
        let files = extract_files_from_diff(diff);

        assert_eq!(files.len(), 1);
        assert_eq!(files[0], "src/main.rs");
    }
}