Skip to main content

semantic_diff/diff/
parser.rs

1use super::{DiffData, DiffFile, DiffLine, DiffSegment, Hunk, LineType, SegmentTag};
2use similar::{ChangeTag, TextDiff};
3
4/// Parse raw `git diff HEAD -M` output into structured DiffData.
5pub fn parse(raw: &str) -> DiffData {
6    // 1. Detect binary files
7    let mut binary_files = Vec::new();
8    for line in raw.lines() {
9        if line.starts_with("Binary files ") && line.ends_with(" differ") {
10            if let Some(path) = extract_binary_path(line) {
11                binary_files.push(path);
12            }
13        }
14    }
15
16    // 2. Parse with unidiff
17    let mut patch = unidiff::PatchSet::new();
18    let _ = patch.parse(raw);
19
20    // 3. Convert to our types, validating paths against traversal attacks
21    let files = patch
22        .files()
23        .iter()
24        .filter_map(|pf| {
25            let source = validate_diff_path(&pf.source_file).unwrap_or_default();
26            let target = validate_diff_path(&pf.target_file).unwrap_or_default();
27
28            // Skip files with invalid target paths (traversal, absolute, etc.)
29            if target.is_empty() {
30                return None;
31            }
32
33            // Best-effort symlink resolution (file may not exist on disk)
34            let target = resolve_if_symlink(&target);
35
36            let is_rename = is_rename_file(&source, &target);
37
38            let hunks = pf
39                .hunks()
40                .iter()
41                .map(|h| {
42                    let lines = h
43                        .lines()
44                        .iter()
45                        .filter_map(|line| {
46                            let content = line.value.clone();
47                            // Skip "No newline at end of file" markers
48                            if content.starts_with("\\ No newline") {
49                                return None;
50                            }
51                            let line_type = match line.line_type.as_str() {
52                                "+" => LineType::Added,
53                                "-" => LineType::Removed,
54                                _ => LineType::Context,
55                            };
56                            Some(DiffLine {
57                                line_type,
58                                content,
59                                inline_segments: None,
60                            })
61                        })
62                        .collect();
63
64                    let mut hunk = Hunk {
65                        header: format!(
66                            "@@ -{},{} +{},{} @@",
67                            h.source_start,
68                            h.source_length,
69                            h.target_start,
70                            h.target_length
71                        ),
72                        source_start: h.source_start,
73                        target_start: h.target_start,
74                        lines,
75                    };
76                    compute_inline_diffs(&mut hunk);
77                    hunk
78                })
79                .collect();
80
81            Some(DiffFile {
82                source_file: source,
83                target_file: target,
84                is_rename,
85                hunks,
86                added_count: pf.added(),
87                removed_count: pf.removed(),
88            })
89        })
90        .collect();
91
92    DiffData {
93        files,
94        binary_files,
95    }
96}
97
/// Check whether a source/target path pair represents a rename.
///
/// Both paths are expected without the diff `a/` / `b/` prefixes, which is how
/// `parse` passes them after `validate_diff_path`. No prefix is stripped here:
/// doing so would mangle real directories named `a` or `b` and report an
/// unchanged `a/x.rs` as renamed.
///
/// A side that is `/dev/null` or empty (a path that was rejected or is absent)
/// means the file was added or deleted, which is never a rename.
fn is_rename_file(source: &str, target: &str) -> bool {
    let is_missing = |side: &str| side.is_empty() || side == "/dev/null";
    !is_missing(source) && !is_missing(target) && source != target
}
104
105/// Compute word-level inline diffs for paired removed/added lines in a hunk.
106///
107/// Walks through the hunk's lines, finds consecutive sequences of Removed lines
108/// followed by Added lines, and pairs them 1:1 for word-level diffing.
109/// Lines longer than 500 characters skip inline diff for performance.
110pub fn compute_inline_diffs(hunk: &mut Hunk) {
111    let len = hunk.lines.len();
112    let mut i = 0;
113
114    while i < len {
115        // Find a run of Removed lines
116        let removed_start = i;
117        while i < len && hunk.lines[i].line_type == LineType::Removed {
118            i += 1;
119        }
120        let removed_end = i;
121
122        // Find a following run of Added lines
123        let added_start = i;
124        while i < len && hunk.lines[i].line_type == LineType::Added {
125            i += 1;
126        }
127        let added_end = i;
128
129        let removed_count = removed_end - removed_start;
130        let added_count = added_end - added_start;
131
132        // If we found both removed and added lines, pair them
133        if removed_count > 0 && added_count > 0 {
134            let pairs = removed_count.min(added_count);
135            for p in 0..pairs {
136                let ri = removed_start + p;
137                let ai = added_start + p;
138
139                let old_content = &hunk.lines[ri].content;
140                let new_content = &hunk.lines[ai].content;
141
142                // Performance guard: skip long lines
143                if old_content.len() > 500 || new_content.len() > 500 {
144                    continue;
145                }
146
147                let diff = TextDiff::from_words(old_content.as_str(), new_content.as_str());
148
149                let mut old_segments = Vec::new();
150                let mut new_segments = Vec::new();
151
152                for change in diff.iter_all_changes() {
153                    let text = change.value().to_string();
154                    match change.tag() {
155                        ChangeTag::Equal => {
156                            old_segments.push(DiffSegment {
157                                tag: SegmentTag::Equal,
158                                text: text.clone(),
159                            });
160                            new_segments.push(DiffSegment {
161                                tag: SegmentTag::Equal,
162                                text,
163                            });
164                        }
165                        ChangeTag::Delete => {
166                            old_segments.push(DiffSegment {
167                                tag: SegmentTag::Changed,
168                                text,
169                            });
170                        }
171                        ChangeTag::Insert => {
172                            new_segments.push(DiffSegment {
173                                tag: SegmentTag::Changed,
174                                text,
175                            });
176                        }
177                    }
178                }
179
180                hunk.lines[ri].inline_segments = Some(old_segments);
181                hunk.lines[ai].inline_segments = Some(new_segments);
182            }
183        }
184
185        // If we didn't advance (e.g., context line), move forward
186        if i == removed_start {
187            i += 1;
188        }
189    }
190}
191
192/// Extract file path from "Binary files a/path and b/path differ" line.
193fn extract_binary_path(line: &str) -> Option<String> {
194    // Format: "Binary files a/path and b/path differ"
195    let rest = line.strip_prefix("Binary files ")?;
196    let rest = rest.strip_suffix(" differ")?;
197    // Split on " and " to get the two paths
198    let parts: Vec<&str> = rest.splitn(2, " and ").collect();
199    if parts.len() == 2 {
200        // Use the target (b/) path, stripping the prefix
201        let target = parts[1].trim_start_matches("b/");
202        // Validate the extracted path against traversal attacks
203        validate_diff_path(target)
204    } else {
205        None
206    }
207}
208
209/// Validate a file path from diff output. Rejects traversal and absolute paths.
210fn validate_diff_path(path: &str) -> Option<String> {
211    // Strip a/ or b/ prefix if present (unidiff convention)
212    let path = path.trim_start_matches("a/").trim_start_matches("b/");
213    // Reject absolute paths
214    if path.starts_with('/') {
215        tracing::warn!("Rejected absolute path from diff: {}", path);
216        return None;
217    }
218    // Reject path traversal (.. components)
219    if path.split('/').any(|component| component == "..") {
220        tracing::warn!("Rejected traversal path from diff: {}", path);
221        return None;
222    }
223    // Reject paths containing null bytes
224    if path.contains('\0') {
225        tracing::warn!("Rejected path with null byte from diff");
226        return None;
227    }
228    Some(path.to_string())
229}
230
231/// Resolve symlinks in a path, validating the resolved path stays within the repo.
232/// Best-effort: returns original path if file doesn't exist or isn't a symlink.
233fn resolve_if_symlink(path: &str) -> String {
234    let p = std::path::Path::new(path);
235    // Check if it's a symlink
236    match std::fs::symlink_metadata(p) {
237        Ok(meta) if meta.file_type().is_symlink() => {
238            match std::fs::canonicalize(p) {
239                Ok(resolved) => {
240                    // Validate resolved path is within cwd
241                    if let Ok(cwd) = std::env::current_dir() {
242                        let canonical_cwd = std::fs::canonicalize(&cwd).unwrap_or(cwd);
243                        if resolved.starts_with(&canonical_cwd) {
244                            resolved.to_string_lossy().to_string()
245                        } else {
246                            tracing::warn!(
247                                "Symlink {} resolves outside repo root to {}, using original path",
248                                path,
249                                resolved.display()
250                            );
251                            path.to_string()
252                        }
253                    } else {
254                        path.to_string()
255                    }
256                }
257                Err(_) => path.to_string(),
258            }
259        }
260        _ => path.to_string(),
261    }
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    // --- validate_diff_path: accepted paths ---

    #[test]
    fn test_validate_diff_path_normal() {
        let validated = validate_diff_path("src/main.rs");
        assert_eq!(validated.as_deref(), Some("src/main.rs"));
    }

    #[test]
    fn test_validate_diff_path_normal_nested() {
        let validated = validate_diff_path("normal/path/file.rs");
        assert_eq!(validated.as_deref(), Some("normal/path/file.rs"));
    }

    #[test]
    fn test_validate_diff_path_strips_prefix() {
        // Both diff-side prefixes must map to the same repository path
        for prefixed in ["b/src/main.rs", "a/src/main.rs"] {
            assert_eq!(
                validate_diff_path(prefixed).as_deref(),
                Some("src/main.rs")
            );
        }
    }

    // --- validate_diff_path: rejected paths ---

    #[test]
    fn test_validate_diff_path_traversal_rejected() {
        assert!(validate_diff_path("../../../etc/passwd").is_none());
    }

    #[test]
    fn test_validate_diff_path_embedded_traversal_rejected() {
        assert!(validate_diff_path("src/../lib.rs").is_none());
    }

    #[test]
    fn test_validate_diff_path_absolute_rejected() {
        assert!(validate_diff_path("/etc/passwd").is_none());
    }

    #[test]
    fn test_validate_diff_path_null_byte_rejected() {
        assert!(validate_diff_path("src/\0evil.rs").is_none());
    }

    // --- extract_binary_path ---

    #[test]
    fn test_extract_binary_path_with_traversal_returns_none() {
        let extracted =
            extract_binary_path("Binary files a/normal.png and b/../../../etc/shadow differ");
        assert!(extracted.is_none());
    }

    #[test]
    fn test_extract_binary_path_valid() {
        let extracted = extract_binary_path("Binary files a/icon.png and b/icon.png differ");
        assert_eq!(extracted.as_deref(), Some("icon.png"));
    }

    // --- parse ---

    #[test]
    fn test_parse_with_traversal_path_skipped() {
        // Minimal diff whose file headers try to escape the repository
        let raw = concat!(
            "diff --git a/../../../etc/passwd b/../../../etc/passwd\n",
            "--- a/../../../etc/passwd\n",
            "+++ b/../../../etc/passwd\n",
            "@@ -0,0 +1 @@\n",
            "+malicious content\n",
        );
        let parsed = parse(raw);
        // No surviving file may carry a traversal component
        let any_traversal = parsed.files.iter().any(|f| f.target_file.contains(".."));
        assert!(!any_traversal, "Traversal paths should be rejected");
    }

    // --- resolve_if_symlink ---

    #[test]
    fn test_resolve_if_symlink_nonexistent() {
        // A path that does not exist on disk comes back unchanged
        let missing = "nonexistent/path/file.rs";
        assert_eq!(resolve_if_symlink(missing), missing);
    }
}