Skip to main content

argus_difflens/
filter.rs

//! Pre-LLM file filtering to eliminate noise at the source.
//!
//! Filters out lock files, generated code, vendored dependencies,
//! minified files, and files matching custom patterns before they
//! reach the LLM, saving tokens and reducing false positives.
6
7use std::path::{Path, PathBuf};
8
9use argus_core::ReviewConfig;
10
11use crate::parser::FileDiff;
12
13/// Files and patterns to skip before sending to LLM.
14///
15/// # Examples
16///
17/// ```
18/// use argus_difflens::filter::DiffFilter;
19///
20/// let filter = DiffFilter::default_filter();
21/// assert!(filter.should_skip("package-lock.json"));
22/// assert!(!filter.should_skip("src/main.rs"));
23/// ```
24pub struct DiffFilter {
25    skip_patterns: Vec<glob::Pattern>,
26    skip_extensions: Vec<String>,
27    max_file_size_lines: usize,
28}
29
30impl DiffFilter {
31    /// Create a filter with sensible defaults.
32    ///
33    /// # Examples
34    ///
35    /// ```
36    /// use argus_difflens::filter::DiffFilter;
37    ///
38    /// let filter = DiffFilter::default_filter();
39    /// assert!(filter.should_skip("yarn.lock"));
40    /// ```
41    pub fn default_filter() -> Self {
42        Self {
43            skip_patterns: Vec::new(),
44            skip_extensions: Vec::new(),
45            max_file_size_lines: 1000,
46        }
47    }
48
49    /// Create a filter from review configuration.
50    ///
51    /// # Examples
52    ///
53    /// ```
54    /// use argus_core::ReviewConfig;
55    /// use argus_difflens::filter::DiffFilter;
56    ///
57    /// let config = ReviewConfig::default();
58    /// let filter = DiffFilter::from_config(&config);
59    /// assert!(filter.should_skip("Cargo.lock"));
60    /// ```
61    pub fn from_config(config: &ReviewConfig) -> Self {
62        let mut skip_patterns = Vec::new();
63        for pat in &config.skip_patterns {
64            if let Ok(p) = glob::Pattern::new(pat) {
65                skip_patterns.push(p);
66            }
67        }
68
69        Self {
70            skip_patterns,
71            skip_extensions: config.skip_extensions.clone(),
72            max_file_size_lines: 1000,
73        }
74    }
75
76    /// Check if a single file path should be skipped.
77    ///
78    /// # Examples
79    ///
80    /// ```
81    /// use argus_difflens::filter::DiffFilter;
82    ///
83    /// let filter = DiffFilter::default_filter();
84    /// assert!(filter.should_skip("vendor/lib.js"));
85    /// assert!(!filter.should_skip("src/lib.rs"));
86    /// ```
87    pub fn should_skip(&self, path: &str) -> bool {
88        self.check_skip(Path::new(path), "", 0).is_some()
89    }
90
91    /// Filter a list of `FileDiff`s, returning only reviewable ones.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// use argus_difflens::filter::DiffFilter;
97    /// use argus_difflens::parser::parse_unified_diff;
98    ///
99    /// let diff = "diff --git a/src/main.rs b/src/main.rs\n\
100    ///             --- a/src/main.rs\n\
101    ///             +++ b/src/main.rs\n\
102    ///             @@ -1,2 +1,3 @@\n\
103    ///              line\n\
104    ///             +new\n";
105    /// let diffs = parse_unified_diff(diff).unwrap();
106    /// let filter = DiffFilter::default_filter();
107    /// let result = filter.filter(diffs);
108    /// assert_eq!(result.kept.len(), 1);
109    /// assert!(result.skipped.is_empty());
110    /// ```
111    pub fn filter(&self, diffs: Vec<FileDiff>) -> FilterResult {
112        let mut kept = Vec::new();
113        let mut skipped = Vec::new();
114
115        for diff in diffs {
116            let path = &diff.new_path;
117            let path_str = path.to_string_lossy();
118
119            let content = Self::collect_hunk_content(&diff);
120            let changed_lines = Self::count_changed_lines(&diff);
121
122            if let Some(reason) = self.check_skip(path, &content, changed_lines) {
123                skipped.push(SkippedFile {
124                    path: path.clone(),
125                    reason,
126                });
127            } else {
128                // Check custom pattern matches
129                let mut matched = false;
130                for pat in &self.skip_patterns {
131                    if pat.matches(&path_str) {
132                        skipped.push(SkippedFile {
133                            path: path.clone(),
134                            reason: SkipReason::PatternMatch(pat.to_string()),
135                        });
136                        matched = true;
137                        break;
138                    }
139                }
140                if !matched {
141                    kept.push(diff);
142                }
143            }
144        }
145
146        FilterResult { kept, skipped }
147    }
148
149    fn check_skip(&self, path: &Path, content: &str, changed_lines: usize) -> Option<SkipReason> {
150        let path_str = path.to_string_lossy();
151        let file_name = path
152            .file_name()
153            .map(|f| f.to_string_lossy().to_string())
154            .unwrap_or_default();
155
156        // Lock files
157        if is_lock_file(&file_name) {
158            return Some(SkipReason::LockFile);
159        }
160
161        // Vendored code
162        if is_vendored(&path_str) {
163            return Some(SkipReason::VendoredCode);
164        }
165
166        // Minified files
167        if is_minified(&file_name, content) {
168            return Some(SkipReason::MinifiedFile);
169        }
170
171        // Generated files (by name pattern)
172        if is_generated_by_name(&file_name) {
173            return Some(SkipReason::GeneratedFile);
174        }
175
176        // Generated files (by content header)
177        if is_generated_by_content(content) {
178            return Some(SkipReason::GeneratedFile);
179        }
180
181        // Custom extension skip
182        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
183            for skip_ext in &self.skip_extensions {
184                if ext == skip_ext {
185                    return Some(SkipReason::PatternMatch(format!("*.{skip_ext}")));
186                }
187            }
188        }
189
190        // Too large
191        if changed_lines > self.max_file_size_lines {
192            return Some(SkipReason::TooLarge);
193        }
194
195        None
196    }
197
198    fn collect_hunk_content(diff: &FileDiff) -> String {
199        let mut content = String::new();
200        for hunk in &diff.hunks {
201            content.push_str(&hunk.content);
202        }
203        content
204    }
205
206    fn count_changed_lines(diff: &FileDiff) -> usize {
207        let mut count = 0;
208        for hunk in &diff.hunks {
209            for line in hunk.content.lines() {
210                if line.starts_with('+') || line.starts_with('-') {
211                    count += 1;
212                }
213            }
214        }
215        count
216    }
217}
218
219/// Result of filtering diffs.
220///
221/// # Examples
222///
223/// ```
224/// use argus_difflens::filter::FilterResult;
225///
226/// let result = FilterResult {
227///     kept: vec![],
228///     skipped: vec![],
229/// };
230/// assert!(result.kept.is_empty());
231/// ```
232pub struct FilterResult {
233    /// Diffs that passed the filter.
234    pub kept: Vec<FileDiff>,
235    /// Files that were skipped with reasons.
236    pub skipped: Vec<SkippedFile>,
237}
238
239/// A file that was skipped during filtering.
240///
241/// # Examples
242///
243/// ```
244/// use std::path::PathBuf;
245/// use argus_difflens::filter::{SkippedFile, SkipReason};
246///
247/// let skipped = SkippedFile {
248///     path: PathBuf::from("package-lock.json"),
249///     reason: SkipReason::LockFile,
250/// };
251/// assert!(matches!(skipped.reason, SkipReason::LockFile));
252/// ```
253#[derive(Debug, Clone)]
254pub struct SkippedFile {
255    /// Path of the skipped file.
256    pub path: PathBuf,
257    /// Why the file was skipped.
258    pub reason: SkipReason,
259}
260
/// Reason a file was skipped.
///
/// # Examples
///
/// ```
/// use argus_difflens::filter::SkipReason;
///
/// let reason = SkipReason::LockFile;
/// assert_eq!(format!("{reason}"), "lock file");
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SkipReason {
    /// Package manager lock file.
    LockFile,
    /// Auto-generated code.
    GeneratedFile,
    /// Third-party vendored code.
    VendoredCode,
    /// Minified or bundled file.
    MinifiedFile,
    /// Binary file.
    BinaryFile,
    /// File exceeds max changed lines threshold.
    TooLarge,
    /// Matched a custom skip pattern; carries the pattern text.
    PatternMatch(String),
}
288
289impl std::fmt::Display for SkipReason {
290    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
291        match self {
292            SkipReason::LockFile => write!(f, "lock file"),
293            SkipReason::GeneratedFile => write!(f, "generated file"),
294            SkipReason::VendoredCode => write!(f, "vendored code"),
295            SkipReason::MinifiedFile => write!(f, "minified file"),
296            SkipReason::BinaryFile => write!(f, "binary file"),
297            SkipReason::TooLarge => write!(f, "too large"),
298            SkipReason::PatternMatch(pat) => write!(f, "pattern: {pat}"),
299        }
300    }
301}
302
/// Exact file names (final path component, case-sensitive) that are treated
/// as package-manager lock files.
const LOCK_FILES: &[&str] = &[
    "package-lock.json",
    "yarn.lock",
    "Cargo.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "Gemfile.lock",
    "composer.lock",
    "go.sum",
];
313
314fn is_lock_file(file_name: &str) -> bool {
315    LOCK_FILES.contains(&file_name)
316}
317
/// Return `true` when any component of `path` is a well-known directory for
/// third-party code: `vendor`, `third_party` or `node_modules`.
///
/// Components are compared exactly, so `vendors/` or `my_vendor/` do not
/// match. Both `/` and `\` are accepted as separators so Windows-style paths
/// are recognised too.
fn is_vendored(path: &str) -> bool {
    path.split(|c: char| c == '/' || c == '\\')
        .any(|part| matches!(part, "vendor" | "third_party" | "node_modules"))
}
327
/// Return `true` when a file looks minified or bundled.
///
/// A file qualifies when its name ends in `.min.js` or `.min.css`, or when
/// any line of `content` is longer than 500 characters. Length is measured in
/// characters rather than bytes, so multi-byte (non-ASCII) text is not
/// flagged earlier than ASCII text of the same visible length. The whole line
/// is measured, including a leading diff marker when `content` is diff text.
fn is_minified(file_name: &str, content: &str) -> bool {
    /// Longest line, in characters, still considered hand-written.
    const MAX_LINE_CHARS: usize = 500;

    if file_name.ends_with(".min.js") || file_name.ends_with(".min.css") {
        return true;
    }

    content.lines().any(|line| {
        // A line cannot have more characters than bytes, so the cheap byte
        // check rules out most lines before characters are counted.
        line.len() > MAX_LINE_CHARS && line.chars().nth(MAX_LINE_CHARS).is_some()
    })
}
340
/// Return `true` when `file_name` follows a naming convention used for
/// generated code: it contains `.generated.`, or ends in `.g.dart`, `.pb.go`
/// or `.pb.rs`.
fn is_generated_by_name(file_name: &str) -> bool {
    file_name.contains(".generated.")
        || [".g.dart", ".pb.go", ".pb.rs"]
            .iter()
            .any(|suffix| file_name.ends_with(suffix))
}
353
/// Return `true` when one of the first five lines of `content` carries a
/// generated-code marker: `// Code generated` or `# AUTO-GENERATED`.
///
/// Added, removed and context lines are all inspected. The markers are
/// matched as substrings, and none of them starts with a diff marker
/// character (`+`, `-` or space), so the leading marker does not need to be
/// stripped before matching.
fn is_generated_by_content(content: &str) -> bool {
    /// Number of leading lines inspected for a header marker.
    const HEADER_LINES: usize = 5;
    const MARKERS: [&str; 2] = ["// Code generated", "# AUTO-GENERATED"];

    content
        .lines()
        .take(HEADER_LINES)
        .any(|line| MARKERS.iter().any(|marker| line.contains(marker)))
}
377
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse_unified_diff;

    /// Build a one-file, one-hunk unified diff for `path` whose hunk body is
    /// `content`, and parse it.
    fn make_diff(path: &str, content: &str) -> Vec<FileDiff> {
        let diff = format!(
            "diff --git a/{path} b/{path}\n\
             --- a/{path}\n\
             +++ b/{path}\n\
             @@ -1,1 +1,2 @@\n\
             {content}\n"
        );
        parse_unified_diff(&diff).unwrap()
    }

    #[test]
    fn lock_files_skipped() {
        let filter = DiffFilter::default_filter();
        for name in LOCK_FILES {
            let diffs = make_diff(name, "+new line");
            let result = filter.filter(diffs);
            assert!(result.kept.is_empty(), "expected {name} to be skipped");
            assert_eq!(result.skipped.len(), 1);
            assert!(matches!(result.skipped[0].reason, SkipReason::LockFile));
        }
    }

    #[test]
    fn generated_files_skipped_by_name() {
        let filter = DiffFilter::default_filter();

        for name in &[
            "api.generated.ts",
            "model.g.dart",
            "proto.pb.go",
            "msg.pb.rs",
        ] {
            let diffs = make_diff(name, "+new line");
            let result = filter.filter(diffs);
            assert!(result.kept.is_empty(), "expected {name} to be skipped");
            assert!(matches!(
                result.skipped[0].reason,
                SkipReason::GeneratedFile
            ));
        }
    }

    #[test]
    fn generated_files_skipped_by_header() {
        let filter = DiffFilter::default_filter();
        let diffs = make_diff("gen.go", "+// Code generated by protoc. DO NOT EDIT.");
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        assert!(matches!(
            result.skipped[0].reason,
            SkipReason::GeneratedFile
        ));
    }

    #[test]
    fn minified_files_skipped() {
        let filter = DiffFilter::default_filter();

        // By name
        let diffs = make_diff("app.min.js", "+var x=1;");
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        assert!(matches!(result.skipped[0].reason, SkipReason::MinifiedFile));

        // By long line heuristic
        let long_line = format!("+{}", "x".repeat(501));
        let diffs = make_diff("bundle.js", &long_line);
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        assert!(matches!(result.skipped[0].reason, SkipReason::MinifiedFile));
    }

    #[test]
    fn vendored_code_skipped() {
        let filter = DiffFilter::default_filter();

        for path in &[
            "vendor/lib.go",
            "third_party/dep.rs",
            "node_modules/pkg/index.js",
        ] {
            let diffs = make_diff(path, "+line");
            let result = filter.filter(diffs);
            assert!(result.kept.is_empty(), "expected {path} to be skipped");
            assert!(matches!(result.skipped[0].reason, SkipReason::VendoredCode));
        }
    }

    #[test]
    fn normal_source_files_kept() {
        let filter = DiffFilter::default_filter();
        let diffs = make_diff("src/main.rs", "+let x = 1;");
        let result = filter.filter(diffs);
        assert_eq!(result.kept.len(), 1);
        assert!(result.skipped.is_empty());
    }

    #[test]
    fn custom_patterns_from_config() {
        let config = ReviewConfig {
            skip_patterns: vec!["*.test.ts".into(), "fixtures/**".into()],
            ..ReviewConfig::default()
        };
        let filter = DiffFilter::from_config(&config);

        let diffs = make_diff("auth.test.ts", "+test line");
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        assert!(matches!(
            result.skipped[0].reason,
            SkipReason::PatternMatch(_)
        ));

        // Normal file still kept
        let diffs = make_diff("src/auth.ts", "+real code");
        let result = filter.filter(diffs);
        assert_eq!(result.kept.len(), 1);
    }

    #[test]
    fn custom_extensions_from_config() {
        let config = ReviewConfig {
            skip_extensions: vec!["snap".into()],
            ..ReviewConfig::default()
        };
        let filter = DiffFilter::from_config(&config);

        let diffs = make_diff("component.test.snap", "+snapshot content");
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        // Extension matches are reported as a pattern match.
        assert!(matches!(
            result.skipped[0].reason,
            SkipReason::PatternMatch(_)
        ));
    }

    #[test]
    fn empty_diff_returns_empty_result() {
        let filter = DiffFilter::default_filter();
        let result = filter.filter(Vec::new());
        assert!(result.kept.is_empty());
        assert!(result.skipped.is_empty());
    }

    #[test]
    fn too_large_files_skipped() {
        let filter = DiffFilter::default_filter();
        // Generate >1000 changed lines
        let mut lines = String::new();
        for i in 0..1002 {
            lines.push_str(&format!("+line {i}\n"));
        }
        let diff = format!(
            "diff --git a/big.rs b/big.rs\n\
             --- a/big.rs\n\
             +++ b/big.rs\n\
             @@ -1,1 +1,1003 @@\n\
             {lines}"
        );
        let diffs = parse_unified_diff(&diff).unwrap();
        let result = filter.filter(diffs);
        assert!(result.kept.is_empty());
        assert!(matches!(result.skipped[0].reason, SkipReason::TooLarge));
    }
}