scribe_scanner/
git_integration.rs

1//! Git integration for enhanced file discovery and status tracking.
2//!
3//! This module provides comprehensive Git integration capabilities including:
4//! - Fast file discovery using `git ls-files`
5//! - File status tracking (modified, staged, untracked)
6//! - Commit history and blame information
7//! - Repository statistics and health metrics
8
9use dashmap::DashMap;
10use scribe_core::{GitFileStatus, GitStatus, Result, ScribeError};
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::path::{Path, PathBuf};
14use std::process::Command;
15use std::time::{SystemTime, UNIX_EPOCH};
16use tokio::process::Command as AsyncCommand;
17
/// Git repository integration handler.
///
/// Holds the repository root, remembers whether the `git` executable could be
/// run, and keeps a `GitCache` of previously computed results.
#[derive(Debug)]
pub struct GitIntegrator {
    /// Repository root; used as the working directory for every `git` invocation.
    repo_path: PathBuf,
    /// Whether `git --version` ran successfully when the integrator was created.
    git_available: bool,
    /// Cached statuses, blame results and bookkeeping for cache expiry.
    cache: GitCache,
}
25
/// Git file information.
///
/// Produced by `GitIntegrator::get_file_info`. When that method answers from
/// its status cache, `last_commit` is `None` and the three counters are zero.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitFileInfo {
    /// Path of the file, exactly as passed to `get_file_info`.
    pub path: PathBuf,
    /// Working-tree / index status of the file.
    pub status: GitFileStatus,
    /// Most recent commit touching the file, if it could be determined.
    pub last_commit: Option<GitCommitInfo>,
    /// Blame data for the file, if it could be determined.
    pub blame_info: Option<GitBlameInfo>,
    /// Number of numeric `git log --numstat` entries found for the file.
    pub changes_count: usize,
    /// Sum of added lines over those entries.
    pub additions: usize,
    /// Sum of deleted lines over those entries.
    pub deletions: usize,
}
37
/// Git commit information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitCommitInfo {
    /// Full commit hash (`%H`).
    pub hash: String,
    /// Author name (`%an`).
    pub author: String,
    /// Author e-mail address (`%ae`).
    pub email: String,
    /// Author date as seconds since the Unix epoch (`%at`).
    pub timestamp: u64,
    /// Commit subject line (`%s`).
    pub message: String,
    /// Number of files changed by the commit.
    /// NOTE(review): `get_last_commit_for_file` always stores 1 here; it is not an accurate count.
    pub files_changed: usize,
}
48
/// Git blame information for a file, as assembled by `parse_blame_output`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitBlameInfo {
    /// One entry per blamed line, in the order git reported them.
    pub lines: Vec<GitBlameLine>,
    /// Author name -> number of lines attributed to that author.
    pub contributors: HashMap<String, usize>,
    /// Largest author timestamp (seconds since the Unix epoch) seen in the blame output; 0 if none.
    pub last_modified: u64,
    /// Line counts bucketed by how old each line's author timestamp is.
    pub age_distribution: AgeDistribution,
}
57
/// Individual line blame information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitBlameLine {
    /// Line number taken from the third field of the blame header line (0 if it fails to parse).
    pub line_number: usize,
    /// Commit hash taken from the first field of the blame header line.
    pub commit_hash: String,
    /// Value of the `author` field for the line.
    pub author: String,
    /// Value of the `author-time` field, in seconds since the Unix epoch.
    pub timestamp: u64,
    /// Line text, i.e. the tab-prefixed line of the blame output with the tab removed.
    pub content: String,
}
67
/// Age distribution of code lines.
///
/// Ages are measured in whole days between a line's author timestamp and the
/// time the blame output was parsed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AgeDistribution {
    /// Lines aged 0-30 days (roughly under one month).
    pub recent: usize,
    /// Lines aged 31-180 days (roughly one to six months).
    pub moderate: usize,
    /// Lines aged 181-365 days (roughly six to twelve months).
    pub old: usize,
    /// Lines older than 365 days.
    pub ancient: usize,
}
76
/// Git repository statistics, as returned by `GitIntegrator::get_repository_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GitRepositoryStats {
    /// Sum of the per-contributor commit counts reported by `git shortlog`.
    pub total_commits: usize,
    /// Per-contributor statistics, sorted by commit count in descending order.
    pub contributors: Vec<ContributorStats>,
    /// Branch names parsed from `git branch -a`.
    pub branches: Vec<String>,
    /// Tag names parsed from `git tag`.
    pub tags: Vec<String>,
    /// File extension -> number of tracked files; files without a (UTF-8)
    /// extension are counted under the key `"no_extension"`.
    pub file_types: HashMap<String, usize>,
    /// Activity per period. NOTE(review): `get_activity_timeline` currently returns an empty list.
    pub activity_timeline: Vec<ActivityPeriod>,
    /// Derived health metrics.
    pub repository_health: RepositoryHealth,
}
88
/// Contributor statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContributorStats {
    /// Contributor name as printed by `git shortlog`.
    pub name: String,
    /// Contributor e-mail address; empty when the shortlog line has no `<...>` part.
    pub email: String,
    /// Commit count reported by `git shortlog`.
    pub commits: usize,
    /// Total added lines over the contributor's numstat entries.
    pub lines_added: usize,
    /// Total deleted lines over the contributor's numstat entries.
    pub lines_deleted: usize,
    /// Number of numstat entries counted (one per file per commit, not distinct files).
    pub files_modified: usize,
    /// Earliest commit timestamp found (seconds since the Unix epoch); 0 if none.
    pub first_commit: u64,
    /// Latest commit timestamp found (seconds since the Unix epoch); 0 if none.
    pub last_commit: u64,
}
101
/// Activity period statistics.
///
/// NOTE(review): nothing visible in this part of the file constructs this
/// type; the field descriptions below are taken from the field names and the
/// existing comments and should be confirmed against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ActivityPeriod {
    /// Period label, e.g. `"2024-01"` (month) or `"2024-W15"` (week).
    pub period: String,
    /// Number of commits in the period.
    pub commits: usize,
    /// Number of lines changed in the period.
    pub lines_changed: usize,
    /// Number of files touched in the period.
    pub files_touched: usize,
    /// Contributors active in the period.
    pub contributors: HashSet<String>,
}
111
/// Repository health metrics, as computed by `calculate_repository_health`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepositoryHealth {
    /// Intended as commits per day. As computed today: total commits divided
    /// by the number of activity periods (0.0 when the timeline is empty).
    pub commit_frequency: f64,
    /// Number of contributors, stored as a float.
    pub contributor_diversity: f64,
    /// Intended as lines changed / lines total. As computed today: total lines
    /// deleted divided by total lines added (0.0 when nothing was added).
    pub code_churn: f64,
    /// Intended as docs files / code files. Currently always 0.0 (placeholder).
    pub documentation_ratio: f64,
    /// Intended as test files / code files. Currently always 0.0 (placeholder).
    pub test_coverage_estimate: f64,
    /// Branch-level health information.
    pub branch_health: BranchHealth,
}
122
/// Branch health information.
///
/// NOTE(review): `calculate_repository_health` currently fills this with fixed
/// placeholder values (`"main"`, 1, 0, 0.0) rather than measured data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BranchHealth {
    /// Name of the main branch.
    pub main_branch: String,
    /// Number of branches considered active.
    pub active_branches: usize,
    /// Number of branches considered stale.
    pub stale_branches: usize,
    /// Estimated merge-conflict risk. NOTE(review): scale is not defined in this file.
    pub merge_conflicts_risk: f64,
}
131
/// Git operations cache for performance.
///
/// All fields use interior mutability (`DashMap` / `RwLock`) so the cache can
/// be updated through a shared `&GitIntegrator`.
#[derive(Debug)]
struct GitCache {
    /// Per-file statuses stored by `get_file_info`, keyed by the path it was called with.
    file_statuses: DashMap<PathBuf, GitFileStatus>,
    /// Commit information keyed by a string.
    /// NOTE(review): only ever cleared in the visible code; presumably keyed by commit hash.
    commit_cache: DashMap<String, GitCommitInfo>,
    /// Blame results keyed by file path.
    blame_cache: DashMap<PathBuf, GitBlameInfo>,
    /// Number of files returned by the most recent `list_tracked_files` call.
    files_discovered: parking_lot::RwLock<usize>,
    /// Time of the most recent cache update; `None` after `clear_cache`.
    cache_timestamp: parking_lot::RwLock<Option<SystemTime>>,
    /// How long after `cache_timestamp` cached entries are considered valid.
    cache_ttl: std::time::Duration,
    /// Statuses loaded by `load_batch_file_statuses`, keyed by repository root joined with the reported path.
    batch_status_cache: DashMap<PathBuf, GitFileStatus>,
}
143
144impl Default for GitCache {
145    fn default() -> Self {
146        Self {
147            file_statuses: DashMap::new(),
148            commit_cache: DashMap::new(),
149            blame_cache: DashMap::new(),
150            files_discovered: parking_lot::RwLock::new(0),
151            cache_timestamp: parking_lot::RwLock::new(None),
152            cache_ttl: std::time::Duration::from_secs(300),
153            batch_status_cache: DashMap::new(),
154        }
155    }
156}
157
158impl GitIntegrator {
159    /// Create a new Git integrator for the given repository path
160    pub fn new<P: AsRef<Path>>(repo_path: P) -> Result<Self> {
161        let repo_path = repo_path.as_ref().to_path_buf();
162
163        // Verify this is a Git repository
164        let git_dir = repo_path.join(".git");
165        if !git_dir.exists() {
166            return Err(ScribeError::git("Not a git repository".to_string()));
167        }
168
169        // Check if git command is available
170        let git_available = Command::new("git")
171            .arg("--version")
172            .output()
173            .map(|output| output.status.success())
174            .unwrap_or(false);
175
176        if !git_available {
177            log::warn!("Git command not available, falling back to filesystem scanning");
178        }
179
180        Ok(Self {
181            repo_path,
182            git_available,
183            cache: GitCache {
184                cache_ttl: std::time::Duration::from_secs(300), // 5 minutes
185                ..Default::default()
186            },
187        })
188    }
189
190    /// List all tracked files in the repository
191    pub async fn list_tracked_files(&self) -> Result<Vec<PathBuf>> {
192        if !self.git_available {
193            return Err(ScribeError::git("Git not available".to_string()));
194        }
195
196        let output = AsyncCommand::new("git")
197            .arg("ls-files")
198            .arg("-z") // null-separated output for safety
199            .current_dir(&self.repo_path)
200            .output()
201            .await
202            .map_err(|e| ScribeError::git(format!("Failed to run git ls-files: {}", e)))?;
203
204        if !output.status.success() {
205            let stderr = String::from_utf8_lossy(&output.stderr);
206            return Err(ScribeError::git(format!("git ls-files failed: {}", stderr)));
207        }
208
209        let stdout = String::from_utf8_lossy(&output.stdout);
210        let files: Vec<PathBuf> = stdout
211            .split('\0')
212            .filter(|s| !s.is_empty())
213            .map(|s| self.repo_path.join(s))
214            .collect();
215
216        // Update cache
217        *self.cache.files_discovered.write() = files.len();
218        *self.cache.cache_timestamp.write() = Some(SystemTime::now());
219
220        log::debug!("Git discovered {} tracked files", files.len());
221        Ok(files)
222    }
223
224    /// Load all file statuses in a single batch operation for better performance
225    pub async fn load_batch_file_statuses(&self) -> Result<()> {
226        if !self.git_available {
227            return Ok(());
228        }
229
230        let output = AsyncCommand::new("git")
231            .arg("status")
232            .arg("--porcelain")
233            .arg("-z") // null-separated output for safety
234            .current_dir(&self.repo_path)
235            .output()
236            .await
237            .map_err(|e| ScribeError::git(format!("Failed to get batch file status: {}", e)))?;
238
239        if !output.status.success() {
240            log::warn!("Git status failed, batch status unavailable");
241            return Ok(());
242        }
243
244        let stdout = String::from_utf8_lossy(&output.stdout);
245        let mut status_map: HashMap<String, Vec<char>> = HashMap::new();
246
247        for line in stdout.split('\0') {
248            if line.len() < 3 {
249                continue;
250            }
251
252            let status_code = &line[..2];
253            let file_path = &line[3..];
254
255            if file_path.is_empty() {
256                continue;
257            }
258
259            let status = match status_code {
260                " M" | "M " | "MM" => GitFileStatus::Modified,
261                "A " | " A" => GitFileStatus::Added,
262                "D " | " D" => GitFileStatus::Deleted,
263                "R " | " R" => GitFileStatus::Renamed,
264                "C " | " C" => GitFileStatus::Copied,
265                "??" => GitFileStatus::Untracked,
266                "!!" => GitFileStatus::Ignored,
267                _ => GitFileStatus::Unmodified,
268            };
269
270            let full_path = self.repo_path.join(file_path);
271            self.cache.batch_status_cache.insert(full_path, status);
272        }
273
274        *self.cache.cache_timestamp.write() = Some(SystemTime::now());
275
276        log::debug!(
277            "Loaded batch file statuses for {} files",
278            self.cache.batch_status_cache.len()
279        );
280
281        Ok(())
282    }
283
284    /// Get detailed file information including git status
285    pub async fn get_file_info(&self, file_path: &Path) -> Result<GitFileInfo> {
286        // Check cache first
287        if let Some(cached_status) = self.cache.file_statuses.get(file_path) {
288            if self.is_cache_valid() {
289                return Ok(GitFileInfo {
290                    path: file_path.to_path_buf(),
291                    status: cached_status.clone(),
292                    last_commit: None, // Would need to implement commit lookup
293                    blame_info: self
294                        .cache
295                        .blame_cache
296                        .get(file_path)
297                        .map(|entry| entry.clone()),
298                    changes_count: 0,
299                    additions: 0,
300                    deletions: 0,
301                });
302            }
303        }
304
305        let status = self.get_file_status(file_path).await?;
306        let last_commit = self.get_last_commit_for_file(file_path).await.ok();
307        let blame_info = self.get_blame_info(file_path).await.ok();
308
309        // Get file change statistics
310        let (changes_count, additions, deletions) = self
311            .get_file_change_stats(file_path)
312            .await
313            .unwrap_or((0, 0, 0));
314
315        // Cache the status and update timestamp
316        self.cache
317            .file_statuses
318            .insert(file_path.to_path_buf(), status.clone());
319        *self.cache.cache_timestamp.write() = Some(SystemTime::now());
320
321        Ok(GitFileInfo {
322            path: file_path.to_path_buf(),
323            status,
324            last_commit,
325            blame_info,
326            changes_count,
327            additions,
328            deletions,
329        })
330    }
331
332    /// Get the current status of a file
333    async fn get_file_status(&self, file_path: &Path) -> Result<GitFileStatus> {
334        if !self.git_available {
335            return Ok(GitFileStatus::Untracked);
336        }
337
338        // Try to use batch cache first
339        if !self.cache.batch_status_cache.is_empty() {
340            if let Some(status) = self.cache.batch_status_cache.get(file_path) {
341                return Ok(status.clone());
342            }
343            // File not found in status map means it's unmodified (tracked but clean)
344            return Ok(GitFileStatus::Unmodified);
345        }
346
347        // Fallback to individual git call if batch cache not available
348        let relative_path = file_path
349            .strip_prefix(&self.repo_path)
350            .map_err(|_| ScribeError::git("File not in repository".to_string()))?;
351
352        let output = AsyncCommand::new("git")
353            .arg("status")
354            .arg("--porcelain")
355            .arg(relative_path)
356            .current_dir(&self.repo_path)
357            .output()
358            .await
359            .map_err(|e| ScribeError::git(format!("Failed to get file status: {}", e)))?;
360
361        if !output.status.success() {
362            return Ok(GitFileStatus::Unmodified);
363        }
364
365        let stdout = String::from_utf8_lossy(&output.stdout);
366        let status = if stdout.is_empty() {
367            GitFileStatus::Unmodified
368        } else {
369            let status_code = stdout.chars().take(2).collect::<String>();
370            match status_code.as_str() {
371                " M" => GitFileStatus::Modified,
372                "M " => GitFileStatus::Modified,
373                "MM" => GitFileStatus::Modified, // Modified after staging
374                "A " => GitFileStatus::Added,
375                "D " => GitFileStatus::Deleted,
376                "R " => GitFileStatus::Renamed,
377                "C " => GitFileStatus::Copied,
378                "??" => GitFileStatus::Untracked,
379                "!!" => GitFileStatus::Ignored,
380                _ => GitFileStatus::Unmodified,
381            }
382        };
383
384        Ok(status)
385    }
386
387    /// Get the last commit information for a file
388    async fn get_last_commit_for_file(&self, file_path: &Path) -> Result<GitCommitInfo> {
389        if !self.git_available {
390            return Err(ScribeError::git("Git not available".to_string()));
391        }
392
393        let relative_path = file_path
394            .strip_prefix(&self.repo_path)
395            .map_err(|_| ScribeError::git("File not in repository".to_string()))?;
396
397        let output = AsyncCommand::new("git")
398            .arg("log")
399            .arg("-1")
400            .arg("--pretty=format:%H|%an|%ae|%at|%s|%H") // hash|author|email|timestamp|subject|hash_again
401            .arg("--")
402            .arg(relative_path)
403            .current_dir(&self.repo_path)
404            .output()
405            .await
406            .map_err(|e| ScribeError::git(format!("Failed to get commit info: {}", e)))?;
407
408        if !output.status.success() {
409            let stderr = String::from_utf8_lossy(&output.stderr);
410            return Err(ScribeError::git(format!("git log failed: {}", stderr)));
411        }
412
413        let stdout = String::from_utf8_lossy(&output.stdout);
414        let parts: Vec<&str> = stdout.trim().splitn(6, '|').collect();
415
416        if parts.len() < 5 {
417            return Err(ScribeError::git("Invalid git log output".to_string()));
418        }
419
420        let timestamp = parts[3]
421            .parse::<u64>()
422            .map_err(|_| ScribeError::git("Invalid timestamp".to_string()))?;
423
424        Ok(GitCommitInfo {
425            hash: parts[0].to_string(),
426            author: parts[1].to_string(),
427            email: parts[2].to_string(),
428            timestamp,
429            message: parts[4].to_string(),
430            files_changed: 1, // Would need additional command to get accurate count
431        })
432    }
433
434    /// Get blame information for a file
435    async fn get_blame_info(&self, file_path: &Path) -> Result<GitBlameInfo> {
436        if !self.git_available {
437            return Err(ScribeError::git("Git not available".to_string()));
438        }
439
440        // Check cache first
441        if let Some(cached_blame) = self.cache.blame_cache.get(file_path) {
442            if self.is_cache_valid() {
443                return Ok(cached_blame.clone());
444            }
445        }
446
447        let relative_path = file_path
448            .strip_prefix(&self.repo_path)
449            .map_err(|_| ScribeError::git("File not in repository".to_string()))?;
450
451        let output = AsyncCommand::new("git")
452            .arg("blame")
453            .arg("--porcelain")
454            .arg(relative_path)
455            .current_dir(&self.repo_path)
456            .output()
457            .await
458            .map_err(|e| ScribeError::git(format!("Failed to get blame info: {}", e)))?;
459
460        if !output.status.success() {
461            let stderr = String::from_utf8_lossy(&output.stderr);
462            return Err(ScribeError::git(format!("git blame failed: {}", stderr)));
463        }
464
465        let stdout = String::from_utf8_lossy(&output.stdout);
466        let blame_info = self.parse_blame_output(&stdout)?;
467
468        Ok(blame_info)
469    }
470
471    /// Parse git blame porcelain output
472    fn parse_blame_output(&self, blame_output: &str) -> Result<GitBlameInfo> {
473        let mut lines = Vec::new();
474        let mut contributors = HashMap::new();
475        let mut last_modified = 0u64;
476
477        let blame_lines: Vec<&str> = blame_output.lines().collect();
478        let mut i = 0;
479
480        while i < blame_lines.len() {
481            let line = blame_lines[i];
482            if line.is_empty() {
483                i += 1;
484                continue;
485            }
486
487            // Parse commit hash and line number from first line
488            let parts: Vec<&str> = line.split_whitespace().collect();
489            if parts.len() < 3 {
490                i += 1;
491                continue;
492            }
493
494            let commit_hash = parts[0].to_string();
495            let line_number = parts[2].parse::<usize>().unwrap_or(0);
496
497            // Parse additional information
498            let mut author = String::new();
499            let mut timestamp = 0u64;
500            let mut content = String::new();
501
502            i += 1;
503            while i < blame_lines.len() {
504                let info_line = blame_lines[i];
505                if info_line.starts_with("author ") {
506                    author = info_line[7..].to_string();
507                } else if info_line.starts_with("author-time ") {
508                    timestamp = info_line[12..].parse().unwrap_or(0);
509                    last_modified = last_modified.max(timestamp);
510                } else if info_line.starts_with('\t') {
511                    content = info_line[1..].to_string();
512                    break;
513                }
514                i += 1;
515            }
516
517            // Count lines per author
518            *contributors.entry(author.clone()).or_insert(0) += 1;
519
520            lines.push(GitBlameLine {
521                line_number,
522                commit_hash,
523                author,
524                timestamp,
525                content,
526            });
527
528            i += 1;
529        }
530
531        // Calculate age distribution
532        let now = SystemTime::now()
533            .duration_since(UNIX_EPOCH)
534            .unwrap()
535            .as_secs();
536
537        let mut age_distribution = AgeDistribution {
538            recent: 0,
539            moderate: 0,
540            old: 0,
541            ancient: 0,
542        };
543
544        for line in &lines {
545            let age_seconds = now.saturating_sub(line.timestamp);
546            let age_days = age_seconds / 86400; // seconds per day
547
548            match age_days {
549                0..=30 => age_distribution.recent += 1,
550                31..=180 => age_distribution.moderate += 1,
551                181..=365 => age_distribution.old += 1,
552                _ => age_distribution.ancient += 1,
553            }
554        }
555
556        Ok(GitBlameInfo {
557            lines,
558            contributors,
559            last_modified,
560            age_distribution,
561        })
562    }
563
564    /// Get file change statistics (additions/deletions count)
565    async fn get_file_change_stats(&self, file_path: &Path) -> Result<(usize, usize, usize)> {
566        if !self.git_available {
567            return Err(ScribeError::git("Git not available".to_string()));
568        }
569
570        let relative_path = file_path
571            .strip_prefix(&self.repo_path)
572            .map_err(|_| ScribeError::git("File not in repository".to_string()))?;
573
574        let output = AsyncCommand::new("git")
575            .arg("log")
576            .arg("--numstat")
577            .arg("--pretty=format:")
578            .arg("--")
579            .arg(relative_path)
580            .current_dir(&self.repo_path)
581            .output()
582            .await
583            .map_err(|e| ScribeError::git(format!("Failed to get change stats: {}", e)))?;
584
585        if !output.status.success() {
586            return Ok((0, 0, 0));
587        }
588
589        let stdout = String::from_utf8_lossy(&output.stdout);
590        let mut total_changes = 0;
591        let mut total_additions = 0;
592        let mut total_deletions = 0;
593
594        for line in stdout.lines() {
595            if line.trim().is_empty() {
596                continue;
597            }
598
599            let parts: Vec<&str> = line.split_whitespace().collect();
600            if parts.len() >= 2 {
601                if let (Ok(additions), Ok(deletions)) =
602                    (parts[0].parse::<usize>(), parts[1].parse::<usize>())
603                {
604                    total_additions += additions;
605                    total_deletions += deletions;
606                    total_changes += 1;
607                }
608            }
609        }
610
611        Ok((total_changes, total_additions, total_deletions))
612    }
613
614    /// Get comprehensive repository statistics
615    pub async fn get_repository_stats(&self) -> Result<GitRepositoryStats> {
616        if !self.git_available {
617            return Err(ScribeError::git("Git not available".to_string()));
618        }
619
620        let (total_commits, contributors) = self.get_contributor_stats().await?;
621        let branches = self.get_branches().await?;
622        let tags = self.get_tags().await?;
623        let file_types = self.analyze_file_types().await?;
624        let activity_timeline = self.get_activity_timeline().await?;
625        let repository_health = self
626            .calculate_repository_health(&contributors, &activity_timeline)
627            .await?;
628
629        Ok(GitRepositoryStats {
630            total_commits,
631            contributors,
632            branches,
633            tags,
634            file_types,
635            activity_timeline,
636            repository_health,
637        })
638    }
639
640    /// Get contributor statistics
641    async fn get_contributor_stats(&self) -> Result<(usize, Vec<ContributorStats>)> {
642        let output = AsyncCommand::new("git")
643            .arg("shortlog")
644            .arg("-sne")
645            .arg("--all")
646            .current_dir(&self.repo_path)
647            .output()
648            .await
649            .map_err(|e| ScribeError::git(format!("Failed to get contributors: {}", e)))?;
650
651        if !output.status.success() {
652            return Ok((0, vec![]));
653        }
654
655        let stdout = String::from_utf8_lossy(&output.stdout);
656        let mut contributors = Vec::new();
657        let mut total_commits = 0;
658
659        for line in stdout.lines() {
660            if let Some((count_str, name_email)) = line.trim().split_once('\t') {
661                if let Ok(commits) = count_str.trim().parse::<usize>() {
662                    total_commits += commits;
663
664                    // Parse name and email
665                    let (name, email) = if let Some((n, e)) = name_email.rsplit_once('<') {
666                        let email = e.trim_end_matches('>');
667                        (n.trim().to_string(), email.to_string())
668                    } else {
669                        (name_email.to_string(), String::new())
670                    };
671
672                    // Get additional stats for this contributor
673                    let (lines_added, lines_deleted, files_modified, first_commit, last_commit) =
674                        self.get_detailed_contributor_stats(&email)
675                            .await
676                            .unwrap_or((0, 0, 0, 0, 0));
677
678                    contributors.push(ContributorStats {
679                        name,
680                        email,
681                        commits,
682                        lines_added,
683                        lines_deleted,
684                        files_modified,
685                        first_commit,
686                        last_commit,
687                    });
688                }
689            }
690        }
691
692        // Sort by commit count descending
693        contributors.sort_by(|a, b| b.commits.cmp(&a.commits));
694
695        Ok((total_commits, contributors))
696    }
697
698    /// Get detailed statistics for a specific contributor
699    async fn get_detailed_contributor_stats(
700        &self,
701        email: &str,
702    ) -> Result<(usize, usize, usize, u64, u64)> {
703        let output = AsyncCommand::new("git")
704            .arg("log")
705            .arg("--author")
706            .arg(email)
707            .arg("--numstat")
708            .arg("--pretty=format:%at")
709            .current_dir(&self.repo_path)
710            .output()
711            .await
712            .map_err(|e| ScribeError::git(format!("Failed to get detailed stats: {}", e)))?;
713
714        if !output.status.success() {
715            return Ok((0, 0, 0, 0, 0));
716        }
717
718        let stdout = String::from_utf8_lossy(&output.stdout);
719        let mut lines_added = 0;
720        let mut lines_deleted = 0;
721        let mut files_modified = 0;
722        let mut timestamps = Vec::new();
723
724        for line in stdout.lines() {
725            if line.trim().is_empty() {
726                continue;
727            }
728
729            // Check if it's a timestamp line
730            if let Ok(timestamp) = line.parse::<u64>() {
731                timestamps.push(timestamp);
732                continue;
733            }
734
735            // Check if it's a numstat line
736            let parts: Vec<&str> = line.split_whitespace().collect();
737            if parts.len() >= 3 {
738                if let (Ok(added), Ok(deleted)) =
739                    (parts[0].parse::<usize>(), parts[1].parse::<usize>())
740                {
741                    lines_added += added;
742                    lines_deleted += deleted;
743                    files_modified += 1;
744                }
745            }
746        }
747
748        let first_commit = timestamps.iter().min().copied().unwrap_or(0);
749        let last_commit = timestamps.iter().max().copied().unwrap_or(0);
750
751        Ok((
752            lines_added,
753            lines_deleted,
754            files_modified,
755            first_commit,
756            last_commit,
757        ))
758    }
759
760    /// Get list of branches
761    async fn get_branches(&self) -> Result<Vec<String>> {
762        let output = AsyncCommand::new("git")
763            .arg("branch")
764            .arg("-a")
765            .current_dir(&self.repo_path)
766            .output()
767            .await
768            .map_err(|e| ScribeError::git(format!("Failed to get branches: {}", e)))?;
769
770        if !output.status.success() {
771            return Ok(vec![]);
772        }
773
774        let stdout = String::from_utf8_lossy(&output.stdout);
775        let branches = stdout
776            .lines()
777            .map(|line| line.trim_start_matches("* ").trim())
778            .filter(|line| !line.is_empty())
779            .map(|line| line.to_string())
780            .collect();
781
782        Ok(branches)
783    }
784
785    /// Get list of tags
786    async fn get_tags(&self) -> Result<Vec<String>> {
787        let output = AsyncCommand::new("git")
788            .arg("tag")
789            .current_dir(&self.repo_path)
790            .output()
791            .await
792            .map_err(|e| ScribeError::git(format!("Failed to get tags: {}", e)))?;
793
794        if !output.status.success() {
795            return Ok(vec![]);
796        }
797
798        let stdout = String::from_utf8_lossy(&output.stdout);
799        let tags = stdout
800            .lines()
801            .filter(|line| !line.trim().is_empty())
802            .map(|line| line.trim().to_string())
803            .collect();
804
805        Ok(tags)
806    }
807
808    /// Analyze file types in the repository
809    async fn analyze_file_types(&self) -> Result<HashMap<String, usize>> {
810        let files = self.list_tracked_files().await?;
811        let mut file_types = HashMap::new();
812
813        for file in files {
814            if let Some(extension) = file.extension().and_then(|ext| ext.to_str()) {
815                *file_types.entry(extension.to_string()).or_insert(0) += 1;
816            } else {
817                *file_types.entry("no_extension".to_string()).or_insert(0) += 1;
818            }
819        }
820
821        Ok(file_types)
822    }
823
824    /// Get activity timeline
825    async fn get_activity_timeline(&self) -> Result<Vec<ActivityPeriod>> {
826        // This would implement more sophisticated timeline analysis
827        // For now, returning empty vector as placeholder
828        Ok(vec![])
829    }
830
831    /// Calculate repository health metrics
832    async fn calculate_repository_health(
833        &self,
834        contributors: &[ContributorStats],
835        activity_timeline: &[ActivityPeriod],
836    ) -> Result<RepositoryHealth> {
837        // Calculate basic health metrics
838        let commit_frequency = if !activity_timeline.is_empty() {
839            let total_commits: usize = activity_timeline.iter().map(|p| p.commits).sum();
840            total_commits as f64 / activity_timeline.len() as f64
841        } else {
842            0.0
843        };
844
845        let contributor_diversity = contributors.len() as f64;
846
847        // Basic code churn calculation (would need more sophisticated analysis)
848        let total_added: usize = contributors.iter().map(|c| c.lines_added).sum();
849        let total_deleted: usize = contributors.iter().map(|c| c.lines_deleted).sum();
850        let code_churn = if total_added > 0 {
851            total_deleted as f64 / total_added as f64
852        } else {
853            0.0
854        };
855
856        // Placeholder values for other metrics
857        let documentation_ratio = 0.0;
858        let test_coverage_estimate = 0.0;
859
860        let branch_health = BranchHealth {
861            main_branch: "main".to_string(),
862            active_branches: 1,
863            stale_branches: 0,
864            merge_conflicts_risk: 0.0,
865        };
866
867        Ok(RepositoryHealth {
868            commit_frequency,
869            contributor_diversity,
870            code_churn,
871            documentation_ratio,
872            test_coverage_estimate,
873            branch_health,
874        })
875    }
876
877    /// Check if cache is still valid
878    fn is_cache_valid(&self) -> bool {
879        if let Some(cache_time) = *self.cache.cache_timestamp.read() {
880            SystemTime::now()
881                .duration_since(cache_time)
882                .map(|duration| duration < self.cache.cache_ttl)
883                .unwrap_or(false)
884        } else {
885            false
886        }
887    }
888
889    /// Clear all caches
890    pub fn clear_cache(&self) {
891        self.cache.file_statuses.clear();
892        self.cache.commit_cache.clear();
893        self.cache.blame_cache.clear();
894        self.cache.batch_status_cache.clear();
895        *self.cache.cache_timestamp.write() = None;
896    }
897
898    /// Get number of files discovered through git
899    pub fn files_discovered(&self) -> usize {
900        *self.cache.files_discovered.read()
901    }
902
903    /// Check if git is available
904    pub fn is_git_available(&self) -> bool {
905        self.git_available
906    }
907
908    /// Get repository root path
909    pub fn repo_path(&self) -> &Path {
910        &self.repo_path
911    }
912}
913
914/// Git diff entry representing a single change
915#[derive(Debug, Clone, Serialize, Deserialize)]
916pub struct GitDiffEntry {
917    pub file_path: PathBuf,
918    pub change_type: DiffChangeType,
919    pub diff_content: String,
920    pub line_additions: usize,
921    pub line_deletions: usize,
922    pub commit_hash: Option<String>,
923    pub commit_message: Option<String>,
924    pub author: Option<String>,
925    pub timestamp: Option<u64>,
926    pub old_file_path: Option<PathBuf>, // For renames
927}
928
929/// Type of diff change
930#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
931pub enum DiffChangeType {
932    Added,
933    Modified,
934    Deleted,
935    Renamed,
936    Copied,
937}
938
/// Configuration for diff-based analysis.
///
/// Derives `PartialEq` so that a configuration can be compared against
/// another one (for example against [`DiffAnalysisConfig::default`]).
#[derive(Debug, Clone, PartialEq)]
pub struct DiffAnalysisConfig {
    /// Include changes that are staged in the index.
    pub include_staged: bool,
    /// Include working-tree changes that are not staged.
    pub include_unstaged: bool,
    /// Specific commits whose changes should be included.
    pub include_commits: Option<Vec<String>>,
    /// Revision range handed to `git log`.
    pub commit_range: Option<String>,
    /// Revision specification handed to `git diff` for a branch comparison.
    pub branch_comparison: Option<String>,
    /// Upper bound on the number of commits read for `commit_range`.
    pub max_commits: usize,
    /// Diffs whose content is larger than this many kilobytes are dropped.
    pub max_diff_size_kb: usize,
    /// Simplified glob patterns for paths to drop: `dir/*` matches by path
    /// prefix, `*.ext` by path suffix, anything else by substring.
    pub ignore_patterns: Vec<String>,
    /// Relevance cut-off.
    // NOTE(review): no code visible in this part of the file reads this value.
    pub relevance_threshold: f64,
    /// Keep files whose extension marks them as likely binary.
    pub include_binary_diffs: bool,
    /// Keep files whose path marks them as likely generated.
    pub include_generated_files: bool,
    /// Diffs with more added-plus-deleted lines than this are dropped.
    pub max_lines_per_diff: usize,
}

impl Default for DiffAnalysisConfig {
    /// Analyse staged and unstaged changes only, skipping lock files, logs,
    /// caches, dependency and build directories, binary and generated files.
    fn default() -> Self {
        // Paths that rarely carry reviewable changes.
        const DEFAULT_IGNORE_PATTERNS: [&str; 11] = [
            "*.lock",
            "*.log",
            "*.tmp",
            "*.cache",
            "node_modules/*",
            ".git/*",
            "__pycache__/*",
            "*.min.js",
            "*.min.css",
            "build/*",
            "dist/*",
        ];

        Self {
            include_staged: true,
            include_unstaged: true,
            include_commits: None,
            commit_range: None,
            branch_comparison: None,
            max_commits: 50,
            max_diff_size_kb: 100,
            ignore_patterns: DEFAULT_IGNORE_PATTERNS
                .iter()
                .copied()
                .map(String::from)
                .collect(),
            relevance_threshold: 0.1,
            include_binary_diffs: false,
            include_generated_files: false,
            max_lines_per_diff: 1000,
        }
    }
}
986
987/// Diff analysis result containing all extracted changes
988#[derive(Debug, Clone, Serialize, Deserialize)]
989pub struct DiffAnalysisResult {
990    pub diffs: Vec<GitDiffEntry>,
991    pub total_files_changed: usize,
992    pub total_additions: usize,
993    pub total_deletions: usize,
994    pub commit_range_analyzed: Option<String>,
995    pub analysis_timestamp: u64,
996}
997
998impl GitIntegrator {
999    /// Perform comprehensive diff-based analysis
1000    pub async fn analyze_diffs(&self, config: &DiffAnalysisConfig) -> Result<DiffAnalysisResult> {
1001        if !self.git_available {
1002            return Err(ScribeError::git(
1003                "Git not available for diff analysis".to_string(),
1004            ));
1005        }
1006
1007        let mut all_diffs = Vec::new();
1008
1009        // Extract staged diffs
1010        if config.include_staged {
1011            let staged_diffs = self.extract_staged_diffs(config).await?;
1012            all_diffs.extend(staged_diffs);
1013        }
1014
1015        // Extract unstaged diffs
1016        if config.include_unstaged {
1017            let unstaged_diffs = self.extract_unstaged_diffs(config).await?;
1018            all_diffs.extend(unstaged_diffs);
1019        }
1020
1021        // Extract diffs from specific commits
1022        if let Some(ref commits) = config.include_commits {
1023            for commit_hash in commits {
1024                let commit_diffs = self.extract_commit_diffs(commit_hash, config).await?;
1025                all_diffs.extend(commit_diffs);
1026            }
1027        }
1028
1029        // Extract diffs from commit range
1030        if let Some(ref range) = config.commit_range {
1031            let range_diffs = self.extract_range_diffs(range, config).await?;
1032            all_diffs.extend(range_diffs);
1033        }
1034
1035        // Extract diffs from branch comparison
1036        if let Some(ref branch_comp) = config.branch_comparison {
1037            let branch_diffs = self
1038                .extract_branch_comparison_diffs(branch_comp, config)
1039                .await?;
1040            all_diffs.extend(branch_diffs);
1041        }
1042
1043        // Apply filtering
1044        all_diffs = self.filter_diffs(all_diffs, config).await?;
1045
1046        // Calculate summary statistics
1047        let total_files_changed = all_diffs.len();
1048        let total_additions = all_diffs.iter().map(|d| d.line_additions).sum();
1049        let total_deletions = all_diffs.iter().map(|d| d.line_deletions).sum();
1050
1051        let analysis_timestamp = SystemTime::now()
1052            .duration_since(UNIX_EPOCH)
1053            .unwrap()
1054            .as_secs();
1055
1056        Ok(DiffAnalysisResult {
1057            diffs: all_diffs,
1058            total_files_changed,
1059            total_additions,
1060            total_deletions,
1061            commit_range_analyzed: config.commit_range.clone(),
1062            analysis_timestamp,
1063        })
1064    }
1065
1066    /// Extract staged diffs (changes ready to be committed)
1067    async fn extract_staged_diffs(&self, config: &DiffAnalysisConfig) -> Result<Vec<GitDiffEntry>> {
1068        let output = AsyncCommand::new("git")
1069            .arg("diff")
1070            .arg("--cached")
1071            .arg("--numstat")
1072            .current_dir(&self.repo_path)
1073            .output()
1074            .await
1075            .map_err(|e| ScribeError::git(format!("Failed to get staged diffs: {}", e)))?;
1076
1077        if !output.status.success() {
1078            return Ok(Vec::new());
1079        }
1080
1081        let stdout = String::from_utf8_lossy(&output.stdout);
1082        self.parse_numstat_output(&stdout, DiffSource::Staged).await
1083    }
1084
1085    /// Extract unstaged diffs (working directory changes)
1086    async fn extract_unstaged_diffs(
1087        &self,
1088        config: &DiffAnalysisConfig,
1089    ) -> Result<Vec<GitDiffEntry>> {
1090        let output = AsyncCommand::new("git")
1091            .arg("diff")
1092            .arg("--numstat")
1093            .current_dir(&self.repo_path)
1094            .output()
1095            .await
1096            .map_err(|e| ScribeError::git(format!("Failed to get unstaged diffs: {}", e)))?;
1097
1098        if !output.status.success() {
1099            return Ok(Vec::new());
1100        }
1101
1102        let stdout = String::from_utf8_lossy(&output.stdout);
1103        self.parse_numstat_output(&stdout, DiffSource::Unstaged)
1104            .await
1105    }
1106
1107    /// Extract diffs from a specific commit
1108    async fn extract_commit_diffs(
1109        &self,
1110        commit_hash: &str,
1111        config: &DiffAnalysisConfig,
1112    ) -> Result<Vec<GitDiffEntry>> {
1113        let output = AsyncCommand::new("git")
1114            .arg("show")
1115            .arg("--numstat")
1116            .arg("--name-status")
1117            .arg("--pretty=format:%H|%an|%at|%s")
1118            .arg(commit_hash)
1119            .current_dir(&self.repo_path)
1120            .output()
1121            .await
1122            .map_err(|e| ScribeError::git(format!("Failed to get commit diffs: {}", e)))?;
1123
1124        if !output.status.success() {
1125            return Ok(Vec::new());
1126        }
1127
1128        let stdout = String::from_utf8_lossy(&output.stdout);
1129        self.parse_commit_diff_output(&stdout, commit_hash).await
1130    }
1131
1132    /// Extract diffs from a commit range
1133    async fn extract_range_diffs(
1134        &self,
1135        range: &str,
1136        config: &DiffAnalysisConfig,
1137    ) -> Result<Vec<GitDiffEntry>> {
1138        let output = AsyncCommand::new("git")
1139            .arg("log")
1140            .arg("--numstat")
1141            .arg("--pretty=format:%H|%an|%at|%s")
1142            .arg(format!("--max-count={}", config.max_commits))
1143            .arg(range)
1144            .current_dir(&self.repo_path)
1145            .output()
1146            .await
1147            .map_err(|e| ScribeError::git(format!("Failed to get range diffs: {}", e)))?;
1148
1149        if !output.status.success() {
1150            return Ok(Vec::new());
1151        }
1152
1153        let stdout = String::from_utf8_lossy(&output.stdout);
1154        log::debug!("Git log output for range {}: '{}'", range, stdout);
1155        self.parse_log_diff_output(&stdout).await
1156    }
1157
1158    /// Extract diffs from branch comparison
1159    async fn extract_branch_comparison_diffs(
1160        &self,
1161        branch_comp: &str,
1162        config: &DiffAnalysisConfig,
1163    ) -> Result<Vec<GitDiffEntry>> {
1164        let output = AsyncCommand::new("git")
1165            .arg("diff")
1166            .arg("--numstat")
1167            .arg(branch_comp)
1168            .current_dir(&self.repo_path)
1169            .output()
1170            .await
1171            .map_err(|e| {
1172                ScribeError::git(format!("Failed to get branch comparison diffs: {}", e))
1173            })?;
1174
1175        if !output.status.success() {
1176            return Ok(Vec::new());
1177        }
1178
1179        let stdout = String::from_utf8_lossy(&output.stdout);
1180        self.parse_numstat_output(&stdout, DiffSource::BranchComparison)
1181            .await
1182    }
1183
1184    /// Parse git numstat output format
1185    async fn parse_numstat_output(
1186        &self,
1187        output: &str,
1188        source: DiffSource,
1189    ) -> Result<Vec<GitDiffEntry>> {
1190        let mut diffs = Vec::new();
1191
1192        for line in output.lines() {
1193            if line.trim().is_empty() {
1194                continue;
1195            }
1196
1197            // Parse numstat format: additions deletions filename
1198            let parts: Vec<&str> = line.split('\t').collect();
1199            if parts.len() >= 3 {
1200                // Handle binary files (shown as - -)
1201                let additions = if parts[0] == "-" {
1202                    0
1203                } else {
1204                    parts[0].parse::<usize>().unwrap_or(0)
1205                };
1206                let deletions = if parts[1] == "-" {
1207                    0
1208                } else {
1209                    parts[1].parse::<usize>().unwrap_or(0)
1210                };
1211                let file_path = PathBuf::from(parts[2]);
1212
1213                // Get the actual diff content
1214                let diff_content = self.get_file_diff_content(&file_path, &source).await?;
1215
1216                // Determine change type
1217                let change_type = self.determine_change_type(&file_path, &source).await?;
1218
1219                diffs.push(GitDiffEntry {
1220                    file_path,
1221                    change_type,
1222                    diff_content,
1223                    line_additions: additions,
1224                    line_deletions: deletions,
1225                    commit_hash: None,
1226                    commit_message: None,
1227                    author: None,
1228                    timestamp: None,
1229                    old_file_path: None,
1230                });
1231            }
1232        }
1233
1234        Ok(diffs)
1235    }
1236
1237    /// Parse commit diff output with metadata
1238    async fn parse_commit_diff_output(
1239        &self,
1240        output: &str,
1241        commit_hash: &str,
1242    ) -> Result<Vec<GitDiffEntry>> {
1243        let lines: Vec<&str> = output.lines().collect();
1244        let mut diffs = Vec::new();
1245
1246        if lines.is_empty() {
1247            return Ok(diffs);
1248        }
1249
1250        // Parse commit metadata from first line
1251        let (commit_info, author, timestamp, message) = if let Some(first_line) = lines.first() {
1252            if first_line.contains('|') && first_line.split('|').count() >= 4 {
1253                let parts: Vec<&str> = first_line.split('|').collect();
1254                (
1255                    Some(parts[0].to_string()),
1256                    Some(parts[1].to_string()),
1257                    parts[2].parse::<u64>().ok(),
1258                    Some(parts[3].to_string()),
1259                )
1260            } else {
1261                (Some(commit_hash.to_string()), None, None, None)
1262            }
1263        } else {
1264            (Some(commit_hash.to_string()), None, None, None)
1265        };
1266
1267        // Parse file changes (skip metadata line)
1268        for line in lines.iter().skip(1) {
1269            if line.trim().is_empty() {
1270                continue;
1271            }
1272
1273            // Parse numstat format: additions\tdeletions\tfilename
1274            let parts: Vec<&str> = line.split('\t').collect();
1275            if parts.len() >= 3 {
1276                let additions = if parts[0] == "-" {
1277                    0
1278                } else {
1279                    parts[0].parse::<usize>().unwrap_or(0)
1280                };
1281                let deletions = if parts[1] == "-" {
1282                    0
1283                } else {
1284                    parts[1].parse::<usize>().unwrap_or(0)
1285                };
1286                let file_path = PathBuf::from(parts[2]);
1287
1288                // Get the actual diff content for this commit
1289                let diff_content = self
1290                    .get_commit_file_diff_content(&file_path, commit_hash)
1291                    .await?;
1292                let change_type = DiffChangeType::Modified; // Default for commit diffs
1293
1294                diffs.push(GitDiffEntry {
1295                    file_path,
1296                    change_type,
1297                    diff_content,
1298                    line_additions: additions,
1299                    line_deletions: deletions,
1300                    commit_hash: commit_info.clone(),
1301                    commit_message: message.clone(),
1302                    author: author.clone(),
1303                    timestamp,
1304                    old_file_path: None,
1305                });
1306            }
1307        }
1308
1309        Ok(diffs)
1310    }
1311
1312    /// Parse git log diff output with multiple commits
1313    async fn parse_log_diff_output(&self, output: &str) -> Result<Vec<GitDiffEntry>> {
1314        let mut diffs = Vec::new();
1315        let lines: Vec<&str> = output.lines().collect();
1316        log::debug!("Parsing log diff output with {} lines", lines.len());
1317        let mut i = 0;
1318
1319        while i < lines.len() {
1320            let line = lines[i];
1321
1322            // Check if this is a commit metadata line
1323            if line.contains('|') && line.split('|').count() >= 4 {
1324                let parts: Vec<&str> = line.split('|').collect();
1325                let commit_hash = parts[0].to_string();
1326                let author = parts[1].to_string();
1327                let timestamp = parts[2].parse::<u64>().ok();
1328                let message = parts[3].to_string();
1329
1330                i += 1;
1331
1332                // Parse file changes for this commit (numstat format)
1333                while i < lines.len() && !lines[i].contains('|') {
1334                    let file_line = lines[i];
1335                    if file_line.trim().is_empty() {
1336                        i += 1;
1337                        continue;
1338                    }
1339
1340                    // Parse numstat format: additions\tdeletions\tfilename
1341                    let parts: Vec<&str> = file_line.split('\t').collect();
1342                    if parts.len() >= 3 {
1343                        let additions = if parts[0] == "-" {
1344                            0
1345                        } else {
1346                            parts[0].parse::<usize>().unwrap_or(0)
1347                        };
1348                        let deletions = if parts[1] == "-" {
1349                            0
1350                        } else {
1351                            parts[1].parse::<usize>().unwrap_or(0)
1352                        };
1353                        let file_path = PathBuf::from(parts[2]);
1354
1355                        let diff_content = self
1356                            .get_commit_file_diff_content(&file_path, &commit_hash)
1357                            .await?;
1358
1359                        diffs.push(GitDiffEntry {
1360                            file_path,
1361                            change_type: DiffChangeType::Modified,
1362                            diff_content,
1363                            line_additions: additions,
1364                            line_deletions: deletions,
1365                            commit_hash: Some(commit_hash.clone()),
1366                            commit_message: Some(message.clone()),
1367                            author: Some(author.clone()),
1368                            timestamp,
1369                            old_file_path: None,
1370                        });
1371                    }
1372                    i += 1;
1373                }
1374            } else {
1375                i += 1;
1376            }
1377        }
1378
1379        Ok(diffs)
1380    }
1381
1382    /// Get diff content for a specific file
1383    async fn get_file_diff_content(&self, file_path: &Path, source: &DiffSource) -> Result<String> {
1384        let mut cmd = AsyncCommand::new("git");
1385        cmd.arg("diff");
1386
1387        match source {
1388            DiffSource::Staged => {
1389                cmd.arg("--cached");
1390            }
1391            DiffSource::Unstaged => {}
1392            DiffSource::BranchComparison => {}
1393        }
1394
1395        let output = cmd
1396            .arg("--")
1397            .arg(file_path)
1398            .current_dir(&self.repo_path)
1399            .output()
1400            .await
1401            .map_err(|e| ScribeError::git(format!("Failed to get file diff: {}", e)))?;
1402
1403        if output.status.success() {
1404            Ok(String::from_utf8_lossy(&output.stdout).to_string())
1405        } else {
1406            Ok(String::new())
1407        }
1408    }
1409
1410    /// Get diff content for a file in a specific commit
1411    async fn get_commit_file_diff_content(
1412        &self,
1413        file_path: &Path,
1414        commit_hash: &str,
1415    ) -> Result<String> {
1416        let output = AsyncCommand::new("git")
1417            .arg("show")
1418            .arg(format!("{}:{}", commit_hash, file_path.display()))
1419            .current_dir(&self.repo_path)
1420            .output()
1421            .await
1422            .map_err(|e| ScribeError::git(format!("Failed to get commit file diff: {}", e)))?;
1423
1424        if output.status.success() {
1425            Ok(String::from_utf8_lossy(&output.stdout).to_string())
1426        } else {
1427            Ok(String::new())
1428        }
1429    }
1430
1431    /// Determine the type of change for a file
1432    async fn determine_change_type(
1433        &self,
1434        file_path: &Path,
1435        source: &DiffSource,
1436    ) -> Result<DiffChangeType> {
1437        let mut cmd = AsyncCommand::new("git");
1438        cmd.arg("status").arg("--porcelain");
1439
1440        let output = cmd
1441            .arg("--")
1442            .arg(file_path)
1443            .current_dir(&self.repo_path)
1444            .output()
1445            .await
1446            .map_err(|e| ScribeError::git(format!("Failed to determine change type: {}", e)))?;
1447
1448        if output.status.success() {
1449            let stdout = String::from_utf8_lossy(&output.stdout);
1450            if let Some(first_line) = stdout.lines().next() {
1451                let status_code = first_line.chars().take(2).collect::<String>();
1452                return Ok(match status_code.as_str() {
1453                    "A " | " A" => DiffChangeType::Added,
1454                    "D " | " D" => DiffChangeType::Deleted,
1455                    "R " | " R" => DiffChangeType::Renamed,
1456                    "C " | " C" => DiffChangeType::Copied,
1457                    _ => DiffChangeType::Modified,
1458                });
1459            }
1460        }
1461
1462        Ok(DiffChangeType::Modified)
1463    }
1464
1465    /// Apply filtering rules to diffs
1466    async fn filter_diffs(
1467        &self,
1468        mut diffs: Vec<GitDiffEntry>,
1469        config: &DiffAnalysisConfig,
1470    ) -> Result<Vec<GitDiffEntry>> {
1471        // Filter by ignore patterns
1472        diffs.retain(|diff| {
1473            !config.ignore_patterns.iter().any(|pattern| {
1474                // Simple pattern matching - could be enhanced with proper glob matching
1475                if pattern.ends_with("/*") {
1476                    let prefix = &pattern[..pattern.len() - 2];
1477                    diff.file_path.to_string_lossy().starts_with(prefix)
1478                } else if pattern.starts_with("*.") {
1479                    let suffix = &pattern[1..];
1480                    diff.file_path.to_string_lossy().ends_with(suffix)
1481                } else {
1482                    diff.file_path.to_string_lossy().contains(pattern)
1483                }
1484            })
1485        });
1486
1487        // Filter by diff size
1488        diffs.retain(|diff| {
1489            let diff_size_kb = diff.diff_content.len() / 1024;
1490            diff_size_kb <= config.max_diff_size_kb
1491        });
1492
1493        // Filter by line count
1494        diffs.retain(|diff| {
1495            let line_count = diff.line_additions + diff.line_deletions;
1496            line_count <= config.max_lines_per_diff
1497        });
1498
1499        // Filter binary files if not included
1500        if !config.include_binary_diffs {
1501            diffs.retain(|diff| !self.is_likely_binary_file(&diff.file_path));
1502        }
1503
1504        // Filter generated files if not included
1505        if !config.include_generated_files {
1506            diffs.retain(|diff| !self.is_likely_generated_file(&diff.file_path));
1507        }
1508
1509        Ok(diffs)
1510    }
1511
1512    /// Check if a file is likely binary
1513    fn is_likely_binary_file(&self, file_path: &Path) -> bool {
1514        if let Some(extension) = file_path.extension().and_then(|ext| ext.to_str()) {
1515            matches!(
1516                extension.to_lowercase().as_str(),
1517                "png"
1518                    | "jpg"
1519                    | "jpeg"
1520                    | "gif"
1521                    | "bmp"
1522                    | "ico"
1523                    | "svg"
1524                    | "pdf"
1525                    | "doc"
1526                    | "docx"
1527                    | "xls"
1528                    | "xlsx"
1529                    | "ppt"
1530                    | "pptx"
1531                    | "zip"
1532                    | "tar"
1533                    | "gz"
1534                    | "7z"
1535                    | "rar"
1536                    | "exe"
1537                    | "dll"
1538                    | "so"
1539                    | "dylib"
1540                    | "mp3"
1541                    | "mp4"
1542                    | "avi"
1543                    | "mov"
1544                    | "wav"
1545            )
1546        } else {
1547            false
1548        }
1549    }
1550
1551    /// Check if a file is likely generated
1552    fn is_likely_generated_file(&self, file_path: &Path) -> bool {
1553        let path_str = file_path.to_string_lossy().to_lowercase();
1554
1555        // Common generated file patterns
1556        path_str.contains("generated")
1557            || path_str.contains(".generated.")
1558            || path_str.contains("node_modules")
1559            || path_str.contains("__pycache__")
1560            || path_str.contains(".pyc")
1561            || path_str.contains("target/")
1562            || path_str.contains("build/")
1563            || path_str.contains("dist/")
1564            || path_str.ends_with(".min.js")
1565            || path_str.ends_with(".min.css")
1566            || path_str.contains("package-lock.json")
1567            || path_str.contains("yarn.lock")
1568            || path_str.contains("Cargo.lock")
1569    }
1570}
1571
/// Source of diff information.
///
/// A plain tag without payload, so it is `Copy` and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DiffSource {
    /// Changes staged in the index (`git diff --cached`).
    Staged,
    /// Working-tree changes that are not staged (`git diff`).
    Unstaged,
    /// Changes relative to another revision (`git diff <spec>`).
    BranchComparison,
}
1579
1580impl Default for AgeDistribution {
1581    fn default() -> Self {
1582        Self {
1583            recent: 0,
1584            moderate: 0,
1585            old: 0,
1586            ancient: 0,
1587        }
1588    }
1589}
1590
1591#[cfg(test)]
1592mod tests {
1593    use super::*;
1594    use std::fs;
1595    use std::process::Command;
1596    use tempfile::TempDir;
1597
1598    async fn create_test_git_repo() -> Result<TempDir> {
1599        let temp_dir = TempDir::new().unwrap();
1600        let repo_path = temp_dir.path();
1601
1602        // Initialize git repo
1603        let output = Command::new("git")
1604            .arg("init")
1605            .current_dir(repo_path)
1606            .output();
1607
1608        if output.is_err() || !output.unwrap().status.success() {
1609            // Skip tests if git is not available
1610            return Err(ScribeError::git(
1611                "Git not available for testing".to_string(),
1612            ));
1613        }
1614
1615        // Configure git for testing
1616        Command::new("git")
1617            .args(&["config", "user.name", "Test User"])
1618            .current_dir(repo_path)
1619            .output()
1620            .unwrap();
1621
1622        Command::new("git")
1623            .args(&["config", "user.email", "test@example.com"])
1624            .current_dir(repo_path)
1625            .output()
1626            .unwrap();
1627
1628        // Create and commit a test file
1629        let test_file = repo_path.join("test.rs");
1630        fs::write(&test_file, "fn main() { println!(\"Hello, world!\"); }").unwrap();
1631
1632        Command::new("git")
1633            .args(&["add", "test.rs"])
1634            .current_dir(repo_path)
1635            .output()
1636            .unwrap();
1637
1638        Command::new("git")
1639            .args(&["commit", "-m", "Initial commit"])
1640            .current_dir(repo_path)
1641            .output()
1642            .unwrap();
1643
1644        Ok(temp_dir)
1645    }
1646
1647    #[tokio::test]
1648    async fn test_git_integrator_creation() {
1649        if let Ok(temp_dir) = create_test_git_repo().await {
1650            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1651            assert!(integrator.is_git_available());
1652            assert_eq!(integrator.repo_path(), temp_dir.path());
1653        }
1654    }
1655
1656    #[tokio::test]
1657    async fn test_list_tracked_files() {
1658        if let Ok(temp_dir) = create_test_git_repo().await {
1659            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1660            let files = integrator.list_tracked_files().await.unwrap();
1661
1662            assert_eq!(files.len(), 1);
1663            assert!(files[0].file_name().unwrap() == "test.rs");
1664            assert_eq!(integrator.files_discovered(), 1);
1665        }
1666    }
1667
1668    #[tokio::test]
1669    async fn test_get_file_info() {
1670        if let Ok(temp_dir) = create_test_git_repo().await {
1671            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1672            let test_file = temp_dir.path().join("test.rs");
1673
1674            let file_info = integrator.get_file_info(&test_file).await.unwrap();
1675
1676            assert_eq!(file_info.path, test_file);
1677            assert_eq!(file_info.status, GitFileStatus::Unmodified);
1678            assert!(file_info.last_commit.is_some());
1679        }
1680    }
1681
1682    #[tokio::test]
1683    async fn test_get_repository_stats() {
1684        if let Ok(temp_dir) = create_test_git_repo().await {
1685            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1686            let stats = integrator.get_repository_stats().await.unwrap();
1687
1688            assert!(stats.total_commits >= 1);
1689            assert!(!stats.contributors.is_empty());
1690            assert!(stats.contributors[0].name == "Test User");
1691            assert!(stats.file_types.contains_key("rs"));
1692        }
1693    }
1694
1695    #[tokio::test]
1696    async fn test_file_status_detection() {
1697        if let Ok(temp_dir) = create_test_git_repo().await {
1698            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1699            let test_file = temp_dir.path().join("test.rs");
1700
1701            // File should be tracked initially
1702            let status = integrator.get_file_status(&test_file).await.unwrap();
1703            assert_eq!(status, GitFileStatus::Unmodified);
1704
1705            // Modify the file
1706            fs::write(&test_file, "fn main() { println!(\"Modified!\"); }").unwrap();
1707
1708            let status = integrator.get_file_status(&test_file).await.unwrap();
1709            assert_eq!(status, GitFileStatus::Modified);
1710
1711            // Create untracked file
1712            let new_file = temp_dir.path().join("untracked.rs");
1713            fs::write(&new_file, "// untracked").unwrap();
1714
1715            let status = integrator.get_file_status(&new_file).await.unwrap();
1716            assert_eq!(status, GitFileStatus::Untracked);
1717        }
1718    }
1719
1720    #[tokio::test]
1721    async fn test_blame_info() {
1722        if let Ok(temp_dir) = create_test_git_repo().await {
1723            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1724            let test_file = temp_dir.path().join("test.rs");
1725
1726            let blame_info = integrator.get_blame_info(&test_file).await.unwrap();
1727
1728            assert_eq!(blame_info.lines.len(), 1);
1729            assert!(!blame_info.contributors.is_empty());
1730            assert!(blame_info.contributors.contains_key("Test User"));
1731            assert!(blame_info.last_modified > 0);
1732        }
1733    }
1734
1735    #[test]
1736    fn test_age_distribution_calculation() {
1737        let now = SystemTime::now()
1738            .duration_since(UNIX_EPOCH)
1739            .unwrap()
1740            .as_secs();
1741
1742        let mut age_dist = AgeDistribution::default();
1743
1744        // Simulate line ages
1745        let recent_timestamp = now - (15 * 24 * 3600); // 15 days ago
1746        let moderate_timestamp = now - (90 * 24 * 3600); // 90 days ago
1747        let old_timestamp = now - (300 * 24 * 3600); // 300 days ago
1748        let ancient_timestamp = now - (400 * 24 * 3600); // 400 days ago
1749
1750        let timestamps = vec![
1751            recent_timestamp,
1752            moderate_timestamp,
1753            old_timestamp,
1754            ancient_timestamp,
1755        ];
1756
1757        for timestamp in timestamps {
1758            let age_seconds = now.saturating_sub(timestamp);
1759            let age_days = age_seconds / 86400;
1760
1761            match age_days {
1762                0..=30 => age_dist.recent += 1,
1763                31..=180 => age_dist.moderate += 1,
1764                181..=365 => age_dist.old += 1,
1765                _ => age_dist.ancient += 1,
1766            }
1767        }
1768
1769        assert_eq!(age_dist.recent, 1);
1770        assert_eq!(age_dist.moderate, 1);
1771        assert_eq!(age_dist.old, 1);
1772        assert_eq!(age_dist.ancient, 1);
1773    }
1774
1775    #[tokio::test]
1776    async fn test_cache_functionality() {
1777        if let Ok(temp_dir) = create_test_git_repo().await {
1778            let mut integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1779            let test_file = temp_dir.path().join("test.rs");
1780
1781            // First call should populate cache
1782            let _ = integrator.get_file_info(&test_file).await.unwrap();
1783            assert!(integrator.is_cache_valid());
1784
1785            // Clear cache
1786            integrator.clear_cache();
1787            assert!(!integrator.is_cache_valid());
1788        }
1789    }
1790
1791    #[tokio::test]
1792    async fn test_diff_analysis_staged_changes() {
1793        if let Ok(temp_dir) = create_test_git_repo().await {
1794            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1795
1796            // Modify and stage a file
1797            let test_file = temp_dir.path().join("test.rs");
1798            fs::write(
1799                &test_file,
1800                "fn main() { println!(\"Modified and staged!\"); }",
1801            )
1802            .unwrap();
1803
1804            Command::new("git")
1805                .args(&["add", "test.rs"])
1806                .current_dir(temp_dir.path())
1807                .output()
1808                .unwrap();
1809
1810            let config = DiffAnalysisConfig {
1811                include_staged: true,
1812                include_unstaged: false,
1813                ..Default::default()
1814            };
1815
1816            let result = integrator.analyze_diffs(&config).await.unwrap();
1817
1818            assert_eq!(result.total_files_changed, 1);
1819            assert!(result.total_additions > 0 || result.total_deletions > 0);
1820            assert!(!result.diffs.is_empty());
1821            assert_eq!(result.diffs[0].file_path.file_name().unwrap(), "test.rs");
1822        }
1823    }
1824
1825    #[tokio::test]
1826    async fn test_diff_analysis_unstaged_changes() {
1827        if let Ok(temp_dir) = create_test_git_repo().await {
1828            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1829
1830            // Modify but don't stage a file
1831            let test_file = temp_dir.path().join("test.rs");
1832            fs::write(
1833                &test_file,
1834                "fn main() { println!(\"Modified but not staged!\"); }",
1835            )
1836            .unwrap();
1837
1838            let config = DiffAnalysisConfig {
1839                include_staged: false,
1840                include_unstaged: true,
1841                ..Default::default()
1842            };
1843
1844            let result = integrator.analyze_diffs(&config).await.unwrap();
1845
1846            assert_eq!(result.total_files_changed, 1);
1847            assert!(!result.diffs.is_empty());
1848            assert_eq!(result.diffs[0].change_type, DiffChangeType::Modified);
1849        }
1850    }
1851
1852    #[tokio::test]
1853    async fn test_diff_analysis_commit_range() {
1854        if let Ok(temp_dir) = create_test_git_repo().await {
1855            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1856
1857            // Create a second commit by modifying existing file
1858            let test_file = temp_dir.path().join("test.rs");
1859            fs::write(
1860                &test_file,
1861                "fn main() { println!(\"Modified for second commit!\"); }",
1862            )
1863            .unwrap();
1864
1865            Command::new("git")
1866                .args(&["add", "test.rs"])
1867                .current_dir(temp_dir.path())
1868                .output()
1869                .unwrap();
1870
1871            Command::new("git")
1872                .args(&["commit", "-m", "Modify existing file"])
1873                .current_dir(temp_dir.path())
1874                .output()
1875                .unwrap();
1876
1877            let config = DiffAnalysisConfig {
1878                include_staged: false,
1879                include_unstaged: false,
1880                commit_range: Some("HEAD~1..HEAD".to_string()),
1881                ..Default::default()
1882            };
1883
1884            let result = integrator.analyze_diffs(&config).await.unwrap();
1885
1886            // Should find the modified file from the second commit
1887            assert!(
1888                !result.diffs.is_empty(),
1889                "Expected diffs but got: {:?}",
1890                result
1891            );
1892            let has_test_file = result
1893                .diffs
1894                .iter()
1895                .any(|d| d.file_path.file_name().unwrap() == "test.rs");
1896            assert!(
1897                has_test_file,
1898                "Expected test.rs in diffs but got: {:?}",
1899                result
1900                    .diffs
1901                    .iter()
1902                    .map(|d| &d.file_path)
1903                    .collect::<Vec<_>>()
1904            );
1905        }
1906    }
1907
1908    #[tokio::test]
1909    async fn test_diff_filtering() {
1910        if let Ok(temp_dir) = create_test_git_repo().await {
1911            let integrator = GitIntegrator::new(temp_dir.path()).unwrap();
1912
1913            // Create files with different extensions
1914            let js_file = temp_dir.path().join("test.js");
1915            let lock_file = temp_dir.path().join("package.lock");
1916
1917            fs::write(&js_file, "console.log('test');").unwrap();
1918            fs::write(&lock_file, "{ \"lockfileVersion\": 1 }").unwrap();
1919
1920            Command::new("git")
1921                .args(&["add", "."])
1922                .current_dir(temp_dir.path())
1923                .output()
1924                .unwrap();
1925
1926            let config = DiffAnalysisConfig {
1927                include_staged: true,
1928                include_unstaged: false,
1929                ignore_patterns: vec!["*.lock".to_string()],
1930                ..Default::default()
1931            };
1932
1933            let result = integrator.analyze_diffs(&config).await.unwrap();
1934
1935            // Should include .js file but exclude .lock file
1936            let has_js = result
1937                .diffs
1938                .iter()
1939                .any(|d| d.file_path.extension().unwrap() == "js");
1940            let has_lock = result
1941                .diffs
1942                .iter()
1943                .any(|d| d.file_path.extension().unwrap() == "lock");
1944
1945            assert!(has_js);
1946            assert!(!has_lock);
1947        }
1948    }
1949
1950    #[test]
1951    fn test_binary_file_detection() {
1952        let integrator = GitIntegrator {
1953            repo_path: PathBuf::from("/tmp"),
1954            git_available: true,
1955            cache: GitCache::default(),
1956        };
1957
1958        assert!(integrator.is_likely_binary_file(&PathBuf::from("image.png")));
1959        assert!(integrator.is_likely_binary_file(&PathBuf::from("document.pdf")));
1960        assert!(integrator.is_likely_binary_file(&PathBuf::from("archive.zip")));
1961        assert!(!integrator.is_likely_binary_file(&PathBuf::from("code.rs")));
1962        assert!(!integrator.is_likely_binary_file(&PathBuf::from("README.md")));
1963    }
1964
1965    #[test]
1966    fn test_generated_file_detection() {
1967        let integrator = GitIntegrator {
1968            repo_path: PathBuf::from("/tmp"),
1969            git_available: true,
1970            cache: GitCache::default(),
1971        };
1972
1973        assert!(integrator.is_likely_generated_file(&PathBuf::from("bundle.min.js")));
1974        assert!(integrator.is_likely_generated_file(&PathBuf::from("styles.min.css")));
1975        assert!(
1976            integrator.is_likely_generated_file(&PathBuf::from("node_modules/package/index.js"))
1977        );
1978        assert!(integrator.is_likely_generated_file(&PathBuf::from("target/debug/scribe")));
1979        assert!(integrator.is_likely_generated_file(&PathBuf::from("package-lock.json")));
1980        assert!(!integrator.is_likely_generated_file(&PathBuf::from("src/main.rs")));
1981        assert!(!integrator.is_likely_generated_file(&PathBuf::from("package.json")));
1982    }
1983
1984    #[test]
1985    fn test_diff_analysis_config_default() {
1986        let config = DiffAnalysisConfig::default();
1987
1988        assert!(config.include_staged);
1989        assert!(config.include_unstaged);
1990        assert_eq!(config.max_commits, 50);
1991        assert_eq!(config.max_diff_size_kb, 100);
1992        assert!(!config.include_binary_diffs);
1993        assert!(!config.include_generated_files);
1994        assert!(config.ignore_patterns.contains(&"*.lock".to_string()));
1995        assert!(config
1996            .ignore_patterns
1997            .contains(&"node_modules/*".to_string()));
1998    }
1999}