Skip to main content

git_comma/
filter.rs

//! Smart Diff Filter — excludes machine-generated files from AI diff input.
2
/// How the staged diff should be filtered.
///
/// A dedicated enum is used instead of a `bool` so call sites spell out their intent
/// (Boolean Blindness prevention).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FilterMode {
    /// Default — filter machine-generated files.
    Smart,
    /// User passed `--no-filter` — include everything.
    NoFilter,
}
11
/// Why a file was excluded from the diff.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExclusionReason {
    /// Numstat reported `-` for both the added and deleted counts.
    BinaryFile,
    /// The basename exactly matches a known lockfile name.
    MachineGeneratedLockfile,
    /// The basename ends in `.min.js` or `.min.css`.
    MinifiedFile,
    /// The change exceeded the line-count threshold; carries the reported counts.
    HeuristicSize { added: u32, deleted: u32 },
}

/// A file that will be excluded from the diff.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExcludedFile {
    /// Path of the file as reported by git.
    pub path: String,
    /// The rule that caused the exclusion.
    pub reason: ExclusionReason,
}
27
28/// Result of filtering — carries excluded files AND whether all staged files were excluded.
29#[derive(Debug)]
30pub struct FilterResult {
31    pub excluded: Vec<ExcludedFile>,
32    pub all_excluded: bool,
33}
34
35/// Errors from filter operations.
36#[derive(Debug, thiserror::Error)]
37pub enum FilterError {
38    #[error("Git numstat command failed")]
39    NumstatFailed(#[from] std::io::Error),
40    #[error("Failed to parse numstat line: '{line}'")]
41    ParseError { line: String },
42}
43
/// Returns the final `/`-separated segment of `path`.
///
/// A path without any `/` is returned unchanged; a path ending in `/` yields `""`.
fn get_basename(path: &str) -> &str {
    match path.rfind('/') {
        // '/' is a single byte, so `slash + 1` is always a valid char boundary.
        Some(slash) => &path[slash + 1..],
        None => path,
    }
}
48
/// Reports whether `basename` is one of the known machine-generated lockfiles.
///
/// The comparison is an exact, case-sensitive match on the whole basename, so names
/// that merely contain "lock" or "sum" are not matched.
fn is_machine_generated_lockfile(basename: &str) -> bool {
    const LOCKFILE_NAMES: [&str; 5] = [
        "package-lock.json",
        "pnpm-lock.yaml",
        "yarn.lock",
        "Cargo.lock",
        "go.sum",
    ];

    LOCKFILE_NAMES.iter().any(|&known| known == basename)
}
57
/// Reports whether `basename` names a minified JavaScript or CSS file,
/// i.e. whether it ends in `.min.js` or `.min.css`.
fn is_minified_file(basename: &str) -> bool {
    const MINIFIED_SUFFIXES: [&str; 2] = [".min.js", ".min.css"];

    MINIFIED_SUFFIXES
        .iter()
        .any(|&suffix| basename.ends_with(suffix))
}
62
63/// Converts excluded files to git :(exclude)path args.
64/// Correct git pathspec syntax: :(exclude)path — parenthesis closes BEFORE the path.
65pub fn build_git_exclude_args(excluded: &[ExcludedFile]) -> Vec<String> {
66    excluded
67        .iter()
68        .map(|e| format!(":(exclude){}", e.path))
69        .collect()
70}
71
72/// Runs the numstat heuristic pipeline and returns files to exclude.
73pub fn filter_staged_files(
74    mode: FilterMode,
75) -> Result<FilterResult, FilterError> {
76    // NoFilter mode: skip all checks, include everything
77    if mode == FilterMode::NoFilter {
78        return Ok(FilterResult {
79            excluded: Vec::new(),
80            all_excluded: false,
81        });
82    }
83
84    // Run git diff --cached --numstat
85    let output = std::process::Command::new("git")
86        .args(["diff", "--cached", "--numstat"])
87        .output()?;
88
89    if !output.status.success() {
90        return Err(FilterError::NumstatFailed(std::io::Error::new(
91            std::io::ErrorKind::Other,
92            String::from_utf8_lossy(&output.stderr),
93        )));
94    }
95
96    let stdout = String::from_utf8_lossy(&output.stdout);
97    let mut excluded = Vec::new();
98    let mut total_staged = 0;
99
100    for line in stdout.lines() {
101        let parts: Vec<&str> = line.split('\t').collect();
102        if parts.len() < 3 {
103            // Malformed line — skip
104            continue;
105        }
106
107        let path = parts[2];
108        if path.is_empty() {
109            // Empty path — skip
110            continue;
111        }
112
113        total_staged += 1;
114        let added_str = parts[0];
115        let deleted_str = parts[1];
116
117        // Binary file check
118        if added_str == "-" && deleted_str == "-" {
119            excluded.push(ExcludedFile {
120                path: path.to_string(),
121                reason: ExclusionReason::BinaryFile,
122            });
123            continue;
124        }
125
126        let added: u32 = added_str.parse().unwrap_or(0);
127        let deleted: u32 = deleted_str.parse().unwrap_or(0);
128
129        // Size heuristic check
130        if added + deleted > 500 {
131            excluded.push(ExcludedFile {
132                path: path.to_string(),
133                reason: ExclusionReason::HeuristicSize { added, deleted },
134            });
135            continue;
136        }
137
138        // Basename exact-match checks
139        let basename = get_basename(path);
140
141        if is_machine_generated_lockfile(basename) {
142            excluded.push(ExcludedFile {
143                path: path.to_string(),
144                reason: ExclusionReason::MachineGeneratedLockfile,
145            });
146            continue;
147        }
148
149        if is_minified_file(basename) {
150            excluded.push(ExcludedFile {
151                path: path.to_string(),
152                reason: ExclusionReason::MinifiedFile,
153            });
154            continue;
155        }
156
157        // Otherwise: SAFE — do not exclude
158    }
159
160    let all_excluded = excluded.len() == total_staged && total_staged > 0;
161
162    Ok(FilterResult {
163        excluded,
164        all_excluded,
165    })
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Every supported lockfile name is recognised when given as a bare basename.
    #[test]
    fn test_lockfile_exact_match() {
        let lockfiles = [
            "Cargo.lock",
            "package-lock.json",
            "pnpm-lock.yaml",
            "yarn.lock",
            "go.sum",
        ];
        for name in lockfiles {
            assert!(is_machine_generated_lockfile(name), "{} should match", name);
        }
    }

    /// Names that merely contain "lock" or "sum" must not be treated as lockfiles.
    #[test]
    fn test_lockfile_false_positives() {
        let lookalikes = ["lock-screen.jsx", "calculate_sum.ts", "user-lock.json"];
        for name in lookalikes {
            assert!(!is_machine_generated_lockfile(name), "{} should not match", name);
        }
    }

    /// Only the exact `.min.js` / `.min.css` suffixes count as minified.
    #[test]
    fn test_minified_file() {
        let cases = [
            ("bundle.min.js", true),
            ("styles.min.css", true),
            ("main.js", false),
            ("app.min.jsx", false), // .jsx, not .js
            ("normalize.min.css", true),
        ];
        for (name, expected) in cases {
            assert_eq!(is_minified_file(name), expected, "unexpected result for {}", name);
        }
    }

    /// The basename is the last `/`-separated segment, or the whole path if there is no `/`.
    #[test]
    fn test_basename_extraction() {
        let cases = [
            ("src/main.rs", "main.rs"),
            ("frontend/src/components/lock-screen.jsx", "lock-screen.jsx"),
            ("Cargo.lock", "Cargo.lock"),
            ("package-lock.json", "package-lock.json"),
        ];
        for (path, expected) in cases {
            assert_eq!(get_basename(path), expected);
        }
    }

    /// `NoFilter` returns an empty result with `all_excluded` unset.
    #[test]
    fn test_no_filter_returns_empty() {
        let outcome = filter_staged_files(FilterMode::NoFilter).unwrap();
        assert!(outcome.excluded.is_empty());
        assert!(!outcome.all_excluded);
    }

    /// Each excluded file becomes one `:(exclude)<path>` argument, in order.
    #[test]
    fn test_build_git_exclude_args() {
        let lockfile = |path: &str| ExcludedFile {
            path: path.into(),
            reason: ExclusionReason::MachineGeneratedLockfile,
        };
        let excluded = vec![lockfile("Cargo.lock"), lockfile("package-lock.json")];

        let args = build_git_exclude_args(&excluded);

        assert_eq!(args, vec![":(exclude)Cargo.lock", ":(exclude)package-lock.json"]);
    }
}