// semantic_diff/diff/parser.rs

1use super::{DiffData, DiffFile, DiffLine, DiffSegment, Hunk, LineType, SegmentTag};
2use similar::{ChangeTag, TextDiff};
3
4/// Parse raw `git diff HEAD -M` output into structured DiffData.
5pub fn parse(raw: &str) -> DiffData {
6    // 1. Detect binary files
7    let mut binary_files = Vec::new();
8    for line in raw.lines() {
9        if line.starts_with("Binary files ") && line.ends_with(" differ") {
10            if let Some(path) = extract_binary_path(line) {
11                binary_files.push(path);
12            }
13        }
14    }
15
16    // 2. Parse with unidiff
17    let mut patch = unidiff::PatchSet::new();
18    let _ = patch.parse(raw);
19
20    // 3. Convert to our types, validating paths against traversal attacks
21    let files = patch
22        .files()
23        .iter()
24        .filter_map(|pf| {
25            let source = validate_diff_path(&pf.source_file).unwrap_or_default();
26            let target = validate_diff_path(&pf.target_file).unwrap_or_default();
27
28            // Skip files with invalid target paths (traversal, absolute, etc.)
29            if target.is_empty() {
30                return None;
31            }
32
33            // Best-effort symlink resolution (file may not exist on disk)
34            let target = resolve_if_symlink(&target);
35
36            let is_rename = is_rename_file(&source, &target);
37
38            let hunks = pf
39                .hunks()
40                .iter()
41                .map(|h| {
42                    let lines = h
43                        .lines()
44                        .iter()
45                        .filter_map(|line| {
46                            let content = line.value.clone();
47                            // Skip "No newline at end of file" markers
48                            if content.starts_with("\\ No newline") {
49                                return None;
50                            }
51                            let line_type = match line.line_type.as_str() {
52                                "+" => LineType::Added,
53                                "-" => LineType::Removed,
54                                _ => LineType::Context,
55                            };
56                            Some(DiffLine {
57                                line_type,
58                                content,
59                                inline_segments: None,
60                            })
61                        })
62                        .collect();
63
64                    let mut hunk = Hunk {
65                        header: format!(
66                            "@@ -{},{} +{},{} @@",
67                            h.source_start,
68                            h.source_length,
69                            h.target_start,
70                            h.target_length
71                        ),
72                        source_start: h.source_start,
73                        target_start: h.target_start,
74                        lines,
75                    };
76                    compute_inline_diffs(&mut hunk);
77                    hunk
78                })
79                .collect();
80
81            Some(DiffFile {
82                source_file: source,
83                target_file: target,
84                is_rename,
85                is_untracked: false,
86                hunks,
87                added_count: pf.added(),
88                removed_count: pf.removed(),
89            })
90        })
91        .collect();
92
93    DiffData {
94        files,
95        binary_files,
96    }
97}
98
/// Report whether a source/target path pair from a diff describes a rename.
///
/// Accepts either raw diff paths (`a/old.rs` / `b/new.rs`) or paths whose
/// `a/` / `b/` prefixes were already removed.
///
/// Returns `false` when either side is absent (`/dev/null` or an empty
/// string), because that is an added or deleted file rather than a rename.
fn is_rename_file(source: &str, target: &str) -> bool {
    let is_absent = |path: &str| path.is_empty() || path == "/dev/null";
    if is_absent(source) || is_absent(target) {
        return false;
    }

    // Strip the diff prefixes exactly once, and only when BOTH sides carry
    // them. Stripping repeatedly, or on one side only, would make a file that
    // lives in a directory literally named `a/` or `b/` look renamed.
    match (source.strip_prefix("a/"), target.strip_prefix("b/")) {
        (Some(old), Some(new)) => old != new,
        _ => source != target,
    }
}
105
106/// Compute word-level inline diffs for paired removed/added lines in a hunk.
107///
108/// Walks through the hunk's lines, finds consecutive sequences of Removed lines
109/// followed by Added lines, and pairs them 1:1 for word-level diffing.
110/// Lines longer than 500 characters skip inline diff for performance.
111pub fn compute_inline_diffs(hunk: &mut Hunk) {
112    let len = hunk.lines.len();
113    let mut i = 0;
114
115    while i < len {
116        // Find a run of Removed lines
117        let removed_start = i;
118        while i < len && hunk.lines[i].line_type == LineType::Removed {
119            i += 1;
120        }
121        let removed_end = i;
122
123        // Find a following run of Added lines
124        let added_start = i;
125        while i < len && hunk.lines[i].line_type == LineType::Added {
126            i += 1;
127        }
128        let added_end = i;
129
130        let removed_count = removed_end - removed_start;
131        let added_count = added_end - added_start;
132
133        // If we found both removed and added lines, pair them
134        if removed_count > 0 && added_count > 0 {
135            let pairs = removed_count.min(added_count);
136            for p in 0..pairs {
137                let ri = removed_start + p;
138                let ai = added_start + p;
139
140                let old_content = &hunk.lines[ri].content;
141                let new_content = &hunk.lines[ai].content;
142
143                // Performance guard: skip long lines
144                if old_content.len() > 500 || new_content.len() > 500 {
145                    continue;
146                }
147
148                let diff = TextDiff::from_words(old_content.as_str(), new_content.as_str());
149
150                let mut old_segments = Vec::new();
151                let mut new_segments = Vec::new();
152
153                for change in diff.iter_all_changes() {
154                    let text = change.value().to_string();
155                    match change.tag() {
156                        ChangeTag::Equal => {
157                            old_segments.push(DiffSegment {
158                                tag: SegmentTag::Equal,
159                                text: text.clone(),
160                            });
161                            new_segments.push(DiffSegment {
162                                tag: SegmentTag::Equal,
163                                text,
164                            });
165                        }
166                        ChangeTag::Delete => {
167                            old_segments.push(DiffSegment {
168                                tag: SegmentTag::Changed,
169                                text,
170                            });
171                        }
172                        ChangeTag::Insert => {
173                            new_segments.push(DiffSegment {
174                                tag: SegmentTag::Changed,
175                                text,
176                            });
177                        }
178                    }
179                }
180
181                hunk.lines[ri].inline_segments = Some(old_segments);
182                hunk.lines[ai].inline_segments = Some(new_segments);
183            }
184        }
185
186        // If we didn't advance (e.g., context line), move forward
187        if i == removed_start {
188            i += 1;
189        }
190    }
191}
192
193/// Extract file path from "Binary files a/path and b/path differ" line.
194fn extract_binary_path(line: &str) -> Option<String> {
195    // Format: "Binary files a/path and b/path differ"
196    let rest = line.strip_prefix("Binary files ")?;
197    let rest = rest.strip_suffix(" differ")?;
198    // Split on " and " to get the two paths
199    let parts: Vec<&str> = rest.splitn(2, " and ").collect();
200    if parts.len() == 2 {
201        // Use the target (b/) path, stripping the prefix
202        let target = parts[1].trim_start_matches("b/");
203        // Validate the extracted path against traversal attacks
204        validate_diff_path(target)
205    } else {
206        None
207    }
208}
209
210/// Validate a file path from diff output. Rejects traversal and absolute paths.
211fn validate_diff_path(path: &str) -> Option<String> {
212    // Strip a/ or b/ prefix if present (unidiff convention)
213    let path = path.trim_start_matches("a/").trim_start_matches("b/");
214    // Reject absolute paths
215    if path.starts_with('/') {
216        tracing::warn!("Rejected absolute path from diff: {}", path);
217        return None;
218    }
219    // Reject path traversal (.. components)
220    if path.split('/').any(|component| component == "..") {
221        tracing::warn!("Rejected traversal path from diff: {}", path);
222        return None;
223    }
224    // Reject paths containing null bytes
225    if path.contains('\0') {
226        tracing::warn!("Rejected path with null byte from diff");
227        return None;
228    }
229    Some(path.to_string())
230}
231
232/// Resolve symlinks in a path, validating the resolved path stays within the repo.
233/// Best-effort: returns original path if file doesn't exist or isn't a symlink.
234fn resolve_if_symlink(path: &str) -> String {
235    let p = std::path::Path::new(path);
236    // Check if it's a symlink
237    match std::fs::symlink_metadata(p) {
238        Ok(meta) if meta.file_type().is_symlink() => {
239            match std::fs::canonicalize(p) {
240                Ok(resolved) => {
241                    // Validate resolved path is within cwd
242                    if let Ok(cwd) = std::env::current_dir() {
243                        let canonical_cwd = std::fs::canonicalize(&cwd).unwrap_or(cwd);
244                        if resolved.starts_with(&canonical_cwd) {
245                            resolved.to_string_lossy().to_string()
246                        } else {
247                            tracing::warn!(
248                                "Symlink {} resolves outside repo root to {}, using original path",
249                                path,
250                                resolved.display()
251                            );
252                            path.to_string()
253                        }
254                    } else {
255                        path.to_string()
256                    }
257                }
258                Err(_) => path.to_string(),
259            }
260        }
261        _ => path.to_string(),
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain relative path passes validation unchanged.
    #[test]
    fn test_validate_diff_path_normal() {
        let validated = validate_diff_path("src/main.rs");
        assert_eq!(validated.as_deref(), Some("src/main.rs"));
    }

    /// Leading `..` components are rejected.
    #[test]
    fn test_validate_diff_path_traversal_rejected() {
        assert!(validate_diff_path("../../../etc/passwd").is_none());
    }

    /// A `..` component in the middle of the path is rejected too.
    #[test]
    fn test_validate_diff_path_embedded_traversal_rejected() {
        assert!(validate_diff_path("src/../lib.rs").is_none());
    }

    /// Absolute paths are rejected.
    #[test]
    fn test_validate_diff_path_absolute_rejected() {
        assert!(validate_diff_path("/etc/passwd").is_none());
    }

    /// Nested relative paths pass validation unchanged.
    #[test]
    fn test_validate_diff_path_normal_nested() {
        let validated = validate_diff_path("normal/path/file.rs");
        assert_eq!(validated.as_deref(), Some("normal/path/file.rs"));
    }

    /// Both the `a/` and the `b/` diff prefix are removed.
    #[test]
    fn test_validate_diff_path_strips_prefix() {
        for prefixed in &["b/src/main.rs", "a/src/main.rs"] {
            assert_eq!(
                validate_diff_path(prefixed),
                Some("src/main.rs".to_string())
            );
        }
    }

    /// Paths containing a null byte are rejected.
    #[test]
    fn test_validate_diff_path_null_byte_rejected() {
        assert!(validate_diff_path("src/\0evil.rs").is_none());
    }

    /// A binary-file notice whose target escapes the repo yields no path.
    #[test]
    fn test_extract_binary_path_with_traversal_returns_none() {
        let notice = "Binary files a/normal.png and b/../../../etc/shadow differ";
        assert!(extract_binary_path(notice).is_none());
    }

    /// A well-formed binary-file notice yields the prefix-free target path.
    #[test]
    fn test_extract_binary_path_valid() {
        let notice = "Binary files a/icon.png and b/icon.png differ";
        assert_eq!(extract_binary_path(notice).as_deref(), Some("icon.png"));
    }

    /// Files whose diff paths contain traversal never reach the parsed output.
    #[test]
    fn test_parse_with_traversal_path_skipped() {
        // Craft a minimal diff with traversal in the filename
        let raw = concat!(
            "diff --git a/../../../etc/passwd b/../../../etc/passwd\n",
            "--- a/../../../etc/passwd\n",
            "+++ b/../../../etc/passwd\n",
            "@@ -0,0 +1 @@\n",
            "+malicious content\n",
        );
        let parsed = parse(raw);
        // The traversal path should be filtered out
        let leaked = parsed.files.iter().any(|f| f.target_file.contains(".."));
        assert!(!leaked, "Traversal paths should be rejected");
    }

    /// A path that does not exist on disk is returned as given.
    #[test]
    fn test_resolve_if_symlink_nonexistent() {
        let missing = "nonexistent/path/file.rs";
        assert_eq!(resolve_if_symlink(missing), missing);
    }
}
351}