Skip to main content

git_comma/
filter.rs

1//! Smart Diff Filter — excludes machine-generated files from AI diff input.
2
/// Domain type for filter options (Boolean Blindness prevention).
///
/// A dedicated enum keeps call sites self-describing: callers pass
/// `FilterMode::NoFilter` instead of an anonymous `bool`.
///
/// `Eq` is derived alongside `PartialEq` because equality on a field-less enum is
/// total; this lets the type be used wherever a full equivalence relation is required.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FilterMode {
    /// Default — filter machine-generated files
    Smart,
    /// User passed --no-filter — include everything
    NoFilter,
}
11
/// Why a file was excluded from the diff.
///
/// Derives `PartialEq`/`Eq` so reasons can be compared directly (for example with
/// `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExclusionReason {
    /// Numstat reported `-` for both the added and deleted counts.
    BinaryFile,
    /// The basename exactly matched a known lockfile name.
    MachineGeneratedLockfile,
    /// The basename ends in `.min.js` or `.min.css`.
    MinifiedFile,
    /// The combined added + deleted line count exceeded the size threshold;
    /// carries the counts parsed from numstat.
    HeuristicSize { added: u32, deleted: u32 },
}

/// A file that will be excluded from the diff.
///
/// Derives `PartialEq`/`Eq` so whole entries can be compared in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExcludedFile {
    /// Path of the file as reported in the numstat output.
    pub path: String,
    /// Why this file is being excluded.
    pub reason: ExclusionReason,
}
27
28/// Result of filtering — carries excluded files AND whether all staged files were excluded.
29#[derive(Debug)]
30pub struct FilterResult {
31    pub excluded: Vec<ExcludedFile>,
32    pub all_excluded: bool,
33}
34
35impl FilterResult {
36    /// True only when every exclusion is machine-generated (lockfile, binary, or minified).
37    /// HeuristicSize exclusions do NOT count as machine-generated.
38    pub fn all_machine_generated(&self) -> bool {
39        self.all_excluded
40            && self.excluded.iter().all(|f| {
41                matches!(
42                    f.reason,
43                    ExclusionReason::BinaryFile
44                        | ExclusionReason::MachineGeneratedLockfile
45                        | ExclusionReason::MinifiedFile
46                )
47            })
48    }
49}
50
/// Errors from filter operations.
#[derive(Debug, thiserror::Error)]
pub enum FilterError {
    /// The numstat step failed. Convertible from any `std::io::Error` via `#[from]`,
    /// so it covers both a failure to launch `git` and the explicit error built when
    /// `git` exits with a non-zero status.
    #[error("Git numstat command failed")]
    NumstatFailed(#[from] std::io::Error),
    /// A numstat line could not be parsed; `line` carries the offending text.
    /// NOTE(review): nothing in the visible code constructs this variant — confirm
    /// whether it is used elsewhere.
    #[error("Failed to parse numstat line: '{line}'")]
    ParseError { line: String },
}
59
/// Returns the final `/`-separated component of `path`.
///
/// A path without any `/` is returned unchanged; a path ending in `/` yields `""`.
fn get_basename(path: &str) -> &str {
    match path.rfind('/') {
        // '/' is a single byte, so `slash + 1` is always a valid char boundary.
        Some(slash) => &path[slash + 1..],
        None => path,
    }
}
64
/// Tells whether `basename` is one of the recognised machine-generated lockfiles.
///
/// The comparison is an exact, case-sensitive match on the whole basename, so names
/// that merely contain "lock" or "sum" are not matched.
fn is_machine_generated_lockfile(basename: &str) -> bool {
    const LOCKFILE_NAMES: [&str; 5] = [
        "package-lock.json",
        "pnpm-lock.yaml",
        "yarn.lock",
        "Cargo.lock",
        "go.sum",
    ];

    LOCKFILE_NAMES.contains(&basename)
}
73
/// Tells whether `basename` names a minified JavaScript or CSS file, i.e. whether it
/// ends in `.min.js` or `.min.css`.
fn is_minified_file(basename: &str) -> bool {
    const MINIFIED_SUFFIXES: [&str; 2] = [".min.js", ".min.css"];

    MINIFIED_SUFFIXES
        .iter()
        .any(|&suffix| basename.ends_with(suffix))
}
78
79/// Converts excluded files to git :(exclude)path args.
80/// Correct git pathspec syntax: :(exclude)path — parenthesis closes BEFORE the path.
81pub fn build_git_exclude_args(excluded: &[ExcludedFile]) -> Vec<String> {
82    excluded
83        .iter()
84        .map(|e| format!(":(exclude){}", e.path))
85        .collect()
86}
87
88/// Runs the numstat heuristic pipeline and returns files to exclude.
89pub fn filter_staged_files(
90    mode: FilterMode,
91) -> Result<FilterResult, FilterError> {
92    // NoFilter mode: skip all checks, include everything
93    if mode == FilterMode::NoFilter {
94        return Ok(FilterResult {
95            excluded: Vec::new(),
96            all_excluded: false,
97        });
98    }
99
100    // Run git diff --cached --numstat
101    let output = std::process::Command::new("git")
102        .args(["diff", "--cached", "--numstat"])
103        .output()?;
104
105    if !output.status.success() {
106        return Err(FilterError::NumstatFailed(std::io::Error::new(
107            std::io::ErrorKind::Other,
108            String::from_utf8_lossy(&output.stderr),
109        )));
110    }
111
112    let stdout = String::from_utf8_lossy(&output.stdout);
113    let mut excluded = Vec::new();
114    let mut total_staged = 0;
115
116    for line in stdout.lines() {
117        let parts: Vec<&str> = line.split('\t').collect();
118        if parts.len() < 3 {
119            // Malformed line — skip
120            continue;
121        }
122
123        let path = parts[2];
124        if path.is_empty() {
125            // Empty path — skip
126            continue;
127        }
128
129        total_staged += 1;
130        let added_str = parts[0];
131        let deleted_str = parts[1];
132
133        // Binary file check
134        if added_str == "-" && deleted_str == "-" {
135            excluded.push(ExcludedFile {
136                path: path.to_string(),
137                reason: ExclusionReason::BinaryFile,
138            });
139            continue;
140        }
141
142        let added: u32 = added_str.parse().unwrap_or(0);
143        let deleted: u32 = deleted_str.parse().unwrap_or(0);
144
145        // Size heuristic check
146        if added + deleted > 500 {
147            excluded.push(ExcludedFile {
148                path: path.to_string(),
149                reason: ExclusionReason::HeuristicSize { added, deleted },
150            });
151            continue;
152        }
153
154        // Basename exact-match checks
155        let basename = get_basename(path);
156
157        if is_machine_generated_lockfile(basename) {
158            excluded.push(ExcludedFile {
159                path: path.to_string(),
160                reason: ExclusionReason::MachineGeneratedLockfile,
161            });
162            continue;
163        }
164
165        if is_minified_file(basename) {
166            excluded.push(ExcludedFile {
167                path: path.to_string(),
168                reason: ExclusionReason::MinifiedFile,
169            });
170            continue;
171        }
172
173        // Otherwise: SAFE — do not exclude
174    }
175
176    let all_excluded = excluded.len() == total_staged && total_staged > 0;
177
178    Ok(FilterResult {
179        excluded,
180        all_excluded,
181    })
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: an excluded entry for `path` with the given reason.
    fn entry(path: &str, reason: ExclusionReason) -> ExcludedFile {
        ExcludedFile {
            path: path.into(),
            reason,
        }
    }

    /// Fixture: a lockfile exclusion for `path`.
    fn lockfile(path: &str) -> ExcludedFile {
        entry(path, ExclusionReason::MachineGeneratedLockfile)
    }

    /// Fixture: `src/main.rs` excluded by the size heuristic (600 added, 0 deleted).
    fn oversized_source() -> ExcludedFile {
        entry(
            "src/main.rs",
            ExclusionReason::HeuristicSize { added: 600, deleted: 0 },
        )
    }

    /// Fixture: a result in which every staged file was excluded.
    fn fully_excluded(excluded: Vec<ExcludedFile>) -> FilterResult {
        FilterResult {
            excluded,
            all_excluded: true,
        }
    }

    #[test]
    fn test_lockfile_exact_match() {
        let lockfiles = [
            "Cargo.lock",
            "package-lock.json",
            "pnpm-lock.yaml",
            "yarn.lock",
            "go.sum",
        ];
        for name in lockfiles {
            assert!(is_machine_generated_lockfile(name));
        }
    }

    #[test]
    fn test_lockfile_false_positives() {
        // These have "lock" or "sum" in the name but are NOT lockfiles
        let lookalikes = ["lock-screen.jsx", "calculate_sum.ts", "user-lock.json"];
        for name in lookalikes {
            assert!(!is_machine_generated_lockfile(name));
        }
    }

    #[test]
    fn test_minified_file() {
        let cases = [
            ("bundle.min.js", true),
            ("styles.min.css", true),
            ("main.js", false),
            ("app.min.jsx", false), // .jsx, not .js
            ("normalize.min.css", true),
        ];
        for (name, expected) in cases {
            assert_eq!(is_minified_file(name), expected);
        }
    }

    #[test]
    fn test_basename_extraction() {
        let cases = [
            ("src/main.rs", "main.rs"),
            ("frontend/src/components/lock-screen.jsx", "lock-screen.jsx"),
            ("Cargo.lock", "Cargo.lock"),
            ("package-lock.json", "package-lock.json"),
        ];
        for (path, expected) in cases {
            assert_eq!(get_basename(path), expected);
        }
    }

    #[test]
    fn test_no_filter_returns_empty() {
        let FilterResult {
            excluded,
            all_excluded,
        } = filter_staged_files(FilterMode::NoFilter).unwrap();
        assert!(!all_excluded);
        assert!(excluded.is_empty());
    }

    #[test]
    fn test_build_git_exclude_args() {
        let excluded = [lockfile("Cargo.lock"), lockfile("package-lock.json")];
        let args = build_git_exclude_args(&excluded);
        assert_eq!(args, vec![":(exclude)Cargo.lock", ":(exclude)package-lock.json"]);
    }

    #[test]
    fn test_all_machine_generated_true_all_lockfiles() {
        let result = fully_excluded(vec![
            lockfile("Cargo.lock"),
            lockfile("package-lock.json"),
        ]);
        assert!(result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_true_binary_files() {
        let result = fully_excluded(vec![entry("image.png", ExclusionReason::BinaryFile)]);
        assert!(result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_true_mixed_machine() {
        let result = fully_excluded(vec![
            lockfile("Cargo.lock"),
            entry("image.png", ExclusionReason::BinaryFile),
            entry("bundle.min.js", ExclusionReason::MinifiedFile),
        ]);
        assert!(result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_false_with_heuristic() {
        let result = fully_excluded(vec![oversized_source()]);
        assert!(!result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_false_mixed_heuristic() {
        let result = fully_excluded(vec![lockfile("Cargo.lock"), oversized_source()]);
        assert!(!result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_false_not_all_excluded() {
        // Every excluded item is machine-generated, but some staged files were NOT
        // excluded (they remain in the diff), so the answer must be false.
        let result = FilterResult {
            excluded: vec![lockfile("Cargo.lock")],
            all_excluded: false,
        };
        assert!(!result.all_machine_generated());
    }

    #[test]
    fn test_all_machine_generated_false_empty_excluded() {
        let result = FilterResult {
            excluded: Vec::new(),
            all_excluded: false,
        };
        assert!(!result.all_machine_generated());
    }
}
355}