Skip to main content

rma_analyzer/
diff.rs

1//! Diff-aware analysis for PR workflows
2//!
3//! This module provides functionality to filter findings to only those
4//! affecting lines changed in a PR/diff. This is useful for:
5//! - PR workflows where you only want to see new issues
6//! - Incremental analysis to reduce noise
7//! - CI pipelines that focus on changed code
8
9use anyhow::{Context, Result};
10use rma_common::Finding;
11use std::collections::{HashMap, HashSet};
12use std::io::{self, BufRead};
13use std::path::PathBuf;
14use std::process::Command;
15
16/// Map of file paths to the set of line numbers that were changed
17pub type ChangedLines = HashMap<PathBuf, HashSet<usize>>;
18
19/// Get changed lines by running `git diff` against a base reference
20///
21/// This runs `git diff --unified=0` to get a minimal diff showing only
22/// the exact lines that changed.
23///
24/// # Arguments
25/// * `project_root` - The root directory of the git repository
26/// * `base_ref` - The git reference to compare against (e.g., "origin/main")
27///
28/// # Returns
29/// A map of file paths to the set of changed line numbers
30pub fn get_changed_lines_from_git(project_root: &PathBuf, base_ref: &str) -> Result<ChangedLines> {
31    // Run git diff with unified=0 to get exact changed lines
32    let output = Command::new("git")
33        .args(["diff", "--unified=0", base_ref])
34        .current_dir(project_root)
35        .output()
36        .context("Failed to run git diff")?;
37
38    if !output.status.success() {
39        // Try fetching the remote first
40        let _ = Command::new("git")
41            .args(["fetch", "origin"])
42            .current_dir(project_root)
43            .output();
44
45        // Retry the diff
46        let output = Command::new("git")
47            .args(["diff", "--unified=0", base_ref])
48            .current_dir(project_root)
49            .output()
50            .context("Failed to run git diff after fetch")?;
51
52        if !output.status.success() {
53            anyhow::bail!(
54                "git diff failed: {}",
55                String::from_utf8_lossy(&output.stderr)
56            );
57        }
58
59        let diff_text = String::from_utf8_lossy(&output.stdout);
60        return parse_unified_diff(&diff_text, Some(project_root));
61    }
62
63    let diff_text = String::from_utf8_lossy(&output.stdout);
64    parse_unified_diff(&diff_text, Some(project_root))
65}
66
67/// Get changed lines from unified diff text read from stdin
68///
69/// This is useful for piping diff output from other sources.
70///
71/// # Returns
72/// A map of file paths to the set of changed line numbers
73pub fn get_changed_lines_from_stdin() -> Result<ChangedLines> {
74    let stdin = io::stdin();
75    let mut diff_text = String::new();
76
77    for line in stdin.lock().lines() {
78        let line = line.context("Failed to read line from stdin")?;
79        diff_text.push_str(&line);
80        diff_text.push('\n');
81    }
82
83    parse_unified_diff(&diff_text, None)
84}
85
86/// Parse a unified diff and extract the changed line numbers for each file
87///
88/// Handles standard git unified diff format:
89/// ```text
90/// diff --git a/file.rs b/file.rs
91/// --- a/file.rs
92/// +++ b/file.rs
93/// @@ -10,3 +10,5 @@ fn example() {
94///  unchanged
95/// +added line 1
96/// +added line 2
97///  unchanged
98/// ```
99///
100/// # Arguments
101/// * `diff_text` - The unified diff text
102/// * `project_root` - Optional project root to resolve relative paths
103///
104/// # Returns
105/// A map of file paths to the set of changed line numbers (in the new version)
106pub fn parse_unified_diff(diff_text: &str, project_root: Option<&PathBuf>) -> Result<ChangedLines> {
107    let mut changed_lines: ChangedLines = HashMap::new();
108    let mut current_file: Option<PathBuf> = None;
109
110    for line in diff_text.lines() {
111        // Handle file header: +++ b/path/to/file
112        if line.starts_with("+++ ") {
113            let path_str = line
114                .strip_prefix("+++ ")
115                .unwrap()
116                .strip_prefix("b/")
117                .unwrap_or(line.strip_prefix("+++ ").unwrap());
118
119            // Handle /dev/null for new files
120            if path_str == "/dev/null" {
121                current_file = None;
122                continue;
123            }
124
125            let file_path = if let Some(root) = project_root {
126                root.join(path_str)
127            } else {
128                PathBuf::from(path_str)
129            };
130
131            current_file = Some(file_path);
132        }
133        // Handle hunk header: @@ -old_start,old_count +new_start,new_count @@
134        else if line.starts_with("@@ ")
135            && let Some(ref file) = current_file
136            && let Some((new_start, new_count)) = parse_hunk_header(line)
137        {
138            let lines = changed_lines.entry(file.clone()).or_default();
139            // Add all lines in the new range
140            for line_num in new_start..new_start + new_count {
141                lines.insert(line_num);
142            }
143        }
144    }
145
146    Ok(changed_lines)
147}
148
149/// Parse a hunk header to extract the new file line range
150///
151/// Format: @@ -old_start,old_count +new_start,new_count @@ optional context
152/// Or:     @@ -old_start +new_start,new_count @@
153/// Or:     @@ -old_start,old_count +new_start @@
154///
155/// Returns (new_start, new_count) where count defaults to 1 if not specified
156fn parse_hunk_header(line: &str) -> Option<(usize, usize)> {
157    // Extract the part between @@ markers
158    let parts: Vec<&str> = line.split("@@").collect();
159    if parts.len() < 2 {
160        return None;
161    }
162
163    let range_part = parts[1].trim();
164
165    // Find the + part (new file range)
166    for part in range_part.split_whitespace() {
167        if part.starts_with('+') {
168            let range_str = part.strip_prefix('+')?;
169
170            // Parse "start,count" or just "start"
171            if let Some((start_str, count_str)) = range_str.split_once(',') {
172                let start = start_str.parse().ok()?;
173                let count = count_str.parse().ok()?;
174                return Some((start, count));
175            } else {
176                let start = range_str.parse().ok()?;
177                // If no count specified, it means 1 line changed
178                return Some((start, 1));
179            }
180        }
181    }
182
183    None
184}
185
186/// Filter findings to only include those on changed lines
187///
188/// This compares each finding's location against the changed lines map
189/// and keeps only findings that are on lines that were modified.
190///
191/// # Arguments
192/// * `findings` - The list of findings to filter
193/// * `changed_lines` - Map of file paths to changed line numbers
194///
195/// # Returns
196/// A filtered vector containing only findings on changed lines
197pub fn filter_findings_by_diff(
198    findings: Vec<Finding>,
199    changed_lines: &ChangedLines,
200) -> Vec<Finding> {
201    findings
202        .into_iter()
203        .filter(|finding| {
204            let file_path = &finding.location.file;
205
206            // Check if this file has any changed lines
207            if let Some(lines) = changed_lines.get(file_path) {
208                // Check if any line in the finding's range is changed
209                for line in finding.location.start_line..=finding.location.end_line {
210                    if lines.contains(&line) {
211                        return true;
212                    }
213                }
214                false
215            } else {
216                // Try matching with just the filename for relative path handling
217                let file_name = file_path.file_name();
218                for (changed_path, lines) in changed_lines.iter() {
219                    // Check if paths match (handling relative vs absolute)
220                    let paths_match = changed_path == file_path
221                        || changed_path.ends_with(file_path)
222                        || file_path.ends_with(changed_path)
223                        || (file_name.is_some() && changed_path.file_name() == file_name);
224
225                    if paths_match {
226                        for line in finding.location.start_line..=finding.location.end_line {
227                            if lines.contains(&line) {
228                                return true;
229                            }
230                        }
231                    }
232                }
233                false
234            }
235        })
236        .collect()
237}
238
239/// Check if a file is in the changed files set (regardless of line numbers)
240///
241/// This is useful for filtering files before analysis.
242pub fn is_file_changed(file_path: &PathBuf, changed_lines: &ChangedLines) -> bool {
243    if changed_lines.contains_key(file_path) {
244        return true;
245    }
246
247    // Try matching with just the filename for relative path handling
248    let file_name = file_path.file_name();
249    for changed_path in changed_lines.keys() {
250        if changed_path.ends_with(file_path)
251            || file_path.ends_with(changed_path)
252            || (file_name.is_some() && changed_path.file_name() == file_name)
253        {
254            return true;
255        }
256    }
257
258    false
259}
260
261#[cfg(test)]
262mod tests {
263    use super::*;
264    use rma_common::{Language, Severity, SourceLocation};
265
266    #[test]
267    fn test_parse_hunk_header_basic() {
268        // Standard format: @@ -10,3 +10,5 @@
269        let result = parse_hunk_header("@@ -10,3 +10,5 @@");
270        assert_eq!(result, Some((10, 5)));
271    }
272
273    #[test]
274    fn test_parse_hunk_header_no_count() {
275        // Single line change: @@ -10 +10 @@
276        let result = parse_hunk_header("@@ -10 +10 @@");
277        assert_eq!(result, Some((10, 1)));
278    }
279
280    #[test]
281    fn test_parse_hunk_header_with_context() {
282        // With function context: @@ -10,3 +10,5 @@ fn example() {
283        let result = parse_hunk_header("@@ -10,3 +10,5 @@ fn example() {");
284        assert_eq!(result, Some((10, 5)));
285    }
286
287    #[test]
288    fn test_parse_hunk_header_single_line_new_count() {
289        // @@ -5,0 +6,2 @@ - adding 2 lines after line 5
290        let result = parse_hunk_header("@@ -5,0 +6,2 @@");
291        assert_eq!(result, Some((6, 2)));
292    }
293
294    #[test]
295    fn test_parse_hunk_header_deletion_only() {
296        // @@ -10,3 +10,0 @@ - removing 3 lines
297        // This represents 0 lines in the new file at position 10
298        let result = parse_hunk_header("@@ -10,3 +10,0 @@");
299        assert_eq!(result, Some((10, 0)));
300    }
301
302    #[test]
303    fn test_parse_unified_diff_simple() {
304        let diff = r#"diff --git a/src/main.rs b/src/main.rs
305index abc123..def456 100644
306--- a/src/main.rs
307+++ b/src/main.rs
308@@ -10,3 +10,5 @@ fn main() {
309 unchanged
310+added line 1
311+added line 2
312 unchanged
313"#;
314
315        let result = parse_unified_diff(diff, None).unwrap();
316        assert!(result.contains_key(&PathBuf::from("src/main.rs")));
317
318        let lines = result.get(&PathBuf::from("src/main.rs")).unwrap();
319        // Lines 10-14 are in the hunk (5 lines starting at 10)
320        assert!(lines.contains(&10));
321        assert!(lines.contains(&11));
322        assert!(lines.contains(&12));
323        assert!(lines.contains(&13));
324        assert!(lines.contains(&14));
325    }
326
327    #[test]
328    fn test_parse_unified_diff_multiple_hunks() {
329        let diff = r#"diff --git a/src/lib.rs b/src/lib.rs
330--- a/src/lib.rs
331+++ b/src/lib.rs
332@@ -5,2 +5,3 @@
333 line
334+new line at 6
335 line
336@@ -20,1 +21,2 @@
337 old
338+new line at 22
339"#;
340
341        let result = parse_unified_diff(diff, None).unwrap();
342        let lines = result.get(&PathBuf::from("src/lib.rs")).unwrap();
343
344        // First hunk: lines 5-7 (3 lines starting at 5)
345        assert!(lines.contains(&5));
346        assert!(lines.contains(&6));
347        assert!(lines.contains(&7));
348
349        // Second hunk: lines 21-22 (2 lines starting at 21)
350        assert!(lines.contains(&21));
351        assert!(lines.contains(&22));
352    }
353
354    #[test]
355    fn test_parse_unified_diff_new_file() {
356        let diff = r#"diff --git a/src/new_file.rs b/src/new_file.rs
357new file mode 100644
358index 0000000..abc123
359--- /dev/null
360+++ b/src/new_file.rs
361@@ -0,0 +1,10 @@
362+fn new_function() {
363+    println!("hello");
364+}
365"#;
366
367        let result = parse_unified_diff(diff, None).unwrap();
368        assert!(result.contains_key(&PathBuf::from("src/new_file.rs")));
369
370        let lines = result.get(&PathBuf::from("src/new_file.rs")).unwrap();
371        // All 10 lines are new (lines 1-10)
372        for i in 1..=10 {
373            assert!(lines.contains(&i), "Line {} should be marked as changed", i);
374        }
375    }
376
377    #[test]
378    fn test_parse_unified_diff_deleted_file() {
379        let diff = r#"diff --git a/src/old_file.rs b/src/old_file.rs
380deleted file mode 100644
381index abc123..0000000
382--- a/src/old_file.rs
383+++ /dev/null
384@@ -1,10 +0,0 @@
385-fn old_function() {
386-    println!("goodbye");
387-}
388"#;
389
390        let result = parse_unified_diff(diff, None).unwrap();
391        // Deleted files have no changed lines in the new version
392        assert!(!result.contains_key(&PathBuf::from("src/old_file.rs")));
393    }
394
395    #[test]
396    fn test_parse_unified_diff_renamed_file() {
397        let diff = r#"diff --git a/src/old_name.rs b/src/new_name.rs
398similarity index 95%
399rename from src/old_name.rs
400rename to src/new_name.rs
401index abc123..def456 100644
402--- a/src/old_name.rs
403+++ b/src/new_name.rs
404@@ -5,1 +5,2 @@
405 unchanged
406+added in renamed file
407"#;
408
409        let result = parse_unified_diff(diff, None).unwrap();
410        // The new file name should be tracked
411        assert!(result.contains_key(&PathBuf::from("src/new_name.rs")));
412
413        let lines = result.get(&PathBuf::from("src/new_name.rs")).unwrap();
414        assert!(lines.contains(&5));
415        assert!(lines.contains(&6));
416    }
417
418    #[test]
419    fn test_filter_findings_by_diff_keeps_changed() {
420        let mut changed_lines = ChangedLines::new();
421        changed_lines.insert(
422            PathBuf::from("src/main.rs"),
423            vec![10, 11, 12].into_iter().collect(),
424        );
425
426        let findings = vec![
427            create_test_finding("src/main.rs", 10, 10), // On changed line
428            create_test_finding("src/main.rs", 5, 5),   // Not on changed line
429            create_test_finding("src/main.rs", 11, 12), // Spans changed lines
430        ];
431
432        let filtered = filter_findings_by_diff(findings, &changed_lines);
433        assert_eq!(filtered.len(), 2);
434        assert_eq!(filtered[0].location.start_line, 10);
435        assert_eq!(filtered[1].location.start_line, 11);
436    }
437
438    #[test]
439    fn test_filter_findings_by_diff_removes_unchanged() {
440        let mut changed_lines = ChangedLines::new();
441        changed_lines.insert(
442            PathBuf::from("src/main.rs"),
443            vec![10, 11].into_iter().collect(),
444        );
445
446        let findings = vec![
447            create_test_finding("src/main.rs", 5, 5), // Before changed area
448            create_test_finding("src/main.rs", 20, 25), // After changed area
449            create_test_finding("src/other.rs", 10, 10), // Different file
450        ];
451
452        let filtered = filter_findings_by_diff(findings, &changed_lines);
453        assert!(filtered.is_empty());
454    }
455
456    #[test]
457    fn test_filter_findings_partial_overlap() {
458        let mut changed_lines = ChangedLines::new();
459        changed_lines.insert(
460            PathBuf::from("src/main.rs"),
461            vec![10, 11, 12].into_iter().collect(),
462        );
463
464        let findings = vec![
465            create_test_finding("src/main.rs", 8, 10), // Starts before, ends on changed
466            create_test_finding("src/main.rs", 12, 15), // Starts on changed, ends after
467        ];
468
469        let filtered = filter_findings_by_diff(findings, &changed_lines);
470        assert_eq!(filtered.len(), 2); // Both have partial overlap
471    }
472
473    #[test]
474    fn test_is_file_changed() {
475        let mut changed_lines = ChangedLines::new();
476        changed_lines.insert(PathBuf::from("src/main.rs"), vec![10].into_iter().collect());
477
478        assert!(is_file_changed(
479            &PathBuf::from("src/main.rs"),
480            &changed_lines
481        ));
482        assert!(!is_file_changed(
483            &PathBuf::from("src/other.rs"),
484            &changed_lines
485        ));
486    }
487
488    #[test]
489    fn test_path_matching_relative_absolute() {
490        let mut changed_lines = ChangedLines::new();
491        changed_lines.insert(
492            PathBuf::from("/project/src/main.rs"),
493            vec![10].into_iter().collect(),
494        );
495
496        // Should match when finding has relative path
497        let finding = create_test_finding("src/main.rs", 10, 10);
498        let filtered = filter_findings_by_diff(vec![finding], &changed_lines);
499        assert_eq!(filtered.len(), 1);
500    }
501
502    fn create_test_finding(file: &str, start_line: usize, end_line: usize) -> Finding {
503        Finding {
504            id: format!("test-{}-{}", file, start_line),
505            rule_id: "test-rule".to_string(),
506            message: "Test finding".to_string(),
507            severity: Severity::Warning,
508            location: SourceLocation {
509                file: PathBuf::from(file),
510                start_line,
511                start_column: 1,
512                end_line,
513                end_column: 1,
514            },
515            language: Language::Rust,
516            snippet: None,
517            suggestion: None,
518            fix: None,
519            confidence: rma_common::Confidence::Medium,
520            category: rma_common::FindingCategory::Security,
521            subcategory: None,
522            technology: None,
523            impact: None,
524            likelihood: None,
525            source: Default::default(),
526            fingerprint: None,
527            properties: None,
528            occurrence_count: None,
529            additional_locations: None,
530            ai_verdict: None,
531            ai_explanation: None,
532            ai_confidence: None,
533        }
534    }
535}