semantic_diff/diff/
parser.rs1use super::{DiffData, DiffFile, DiffLine, DiffSegment, Hunk, LineType, SegmentTag};
2use similar::{ChangeTag, TextDiff};
3
4pub fn parse(raw: &str) -> DiffData {
6 let mut binary_files = Vec::new();
8 for line in raw.lines() {
9 if line.starts_with("Binary files ") && line.ends_with(" differ") {
10 if let Some(path) = extract_binary_path(line) {
11 binary_files.push(path);
12 }
13 }
14 }
15
16 let mut patch = unidiff::PatchSet::new();
18 let _ = patch.parse(raw);
19
20 let files = patch
22 .files()
23 .iter()
24 .filter_map(|pf| {
25 let source = validate_diff_path(&pf.source_file).unwrap_or_default();
26 let target = validate_diff_path(&pf.target_file).unwrap_or_default();
27
28 if target.is_empty() {
30 return None;
31 }
32
33 let target = resolve_if_symlink(&target);
35
36 let is_rename = is_rename_file(&source, &target);
37
38 let hunks = pf
39 .hunks()
40 .iter()
41 .map(|h| {
42 let lines = h
43 .lines()
44 .iter()
45 .filter_map(|line| {
46 let content = line.value.clone();
47 if content.starts_with("\\ No newline") {
49 return None;
50 }
51 let line_type = match line.line_type.as_str() {
52 "+" => LineType::Added,
53 "-" => LineType::Removed,
54 _ => LineType::Context,
55 };
56 Some(DiffLine {
57 line_type,
58 content,
59 inline_segments: None,
60 })
61 })
62 .collect();
63
64 let mut hunk = Hunk {
65 header: format!(
66 "@@ -{},{} +{},{} @@",
67 h.source_start,
68 h.source_length,
69 h.target_start,
70 h.target_length
71 ),
72 source_start: h.source_start,
73 target_start: h.target_start,
74 lines,
75 };
76 compute_inline_diffs(&mut hunk);
77 hunk
78 })
79 .collect();
80
81 Some(DiffFile {
82 source_file: source,
83 target_file: target,
84 is_rename,
85 hunks,
86 added_count: pf.added(),
87 removed_count: pf.removed(),
88 })
89 })
90 .collect();
91
92 DiffData {
93 files,
94 binary_files,
95 }
96}
97
/// Returns `true` when `source` and `target` name different files, i.e. the
/// diff entry describes a rename.
///
/// A single leading `a/` is removed from `source` and a single leading `b/`
/// from `target` before comparing. Only one prefix is stripped so that real
/// directories called `a` or `b` survive (`a/a/foo.rs` is the file
/// `a/foo.rs`, not `foo.rs`).
///
/// Additions and deletions are never renames: if either side is `/dev/null`
/// or empty (an absent or rejected path), the result is `false`.
fn is_rename_file(source: &str, target: &str) -> bool {
    let is_absent = |p: &str| p.is_empty() || p == "/dev/null";
    if is_absent(source) || is_absent(target) {
        return false;
    }

    // `strip_prefix` removes at most one occurrence, unlike
    // `trim_start_matches`, which would keep eating repeated prefixes.
    let old = source.strip_prefix("a/").unwrap_or(source);
    let new = target.strip_prefix("b/").unwrap_or(target);
    old != new
}
104
105pub fn compute_inline_diffs(hunk: &mut Hunk) {
111 let len = hunk.lines.len();
112 let mut i = 0;
113
114 while i < len {
115 let removed_start = i;
117 while i < len && hunk.lines[i].line_type == LineType::Removed {
118 i += 1;
119 }
120 let removed_end = i;
121
122 let added_start = i;
124 while i < len && hunk.lines[i].line_type == LineType::Added {
125 i += 1;
126 }
127 let added_end = i;
128
129 let removed_count = removed_end - removed_start;
130 let added_count = added_end - added_start;
131
132 if removed_count > 0 && added_count > 0 {
134 let pairs = removed_count.min(added_count);
135 for p in 0..pairs {
136 let ri = removed_start + p;
137 let ai = added_start + p;
138
139 let old_content = &hunk.lines[ri].content;
140 let new_content = &hunk.lines[ai].content;
141
142 if old_content.len() > 500 || new_content.len() > 500 {
144 continue;
145 }
146
147 let diff = TextDiff::from_words(old_content.as_str(), new_content.as_str());
148
149 let mut old_segments = Vec::new();
150 let mut new_segments = Vec::new();
151
152 for change in diff.iter_all_changes() {
153 let text = change.value().to_string();
154 match change.tag() {
155 ChangeTag::Equal => {
156 old_segments.push(DiffSegment {
157 tag: SegmentTag::Equal,
158 text: text.clone(),
159 });
160 new_segments.push(DiffSegment {
161 tag: SegmentTag::Equal,
162 text,
163 });
164 }
165 ChangeTag::Delete => {
166 old_segments.push(DiffSegment {
167 tag: SegmentTag::Changed,
168 text,
169 });
170 }
171 ChangeTag::Insert => {
172 new_segments.push(DiffSegment {
173 tag: SegmentTag::Changed,
174 text,
175 });
176 }
177 }
178 }
179
180 hunk.lines[ri].inline_segments = Some(old_segments);
181 hunk.lines[ai].inline_segments = Some(new_segments);
182 }
183 }
184
185 if i == removed_start {
187 i += 1;
188 }
189 }
190}
191
192fn extract_binary_path(line: &str) -> Option<String> {
194 let rest = line.strip_prefix("Binary files ")?;
196 let rest = rest.strip_suffix(" differ")?;
197 let parts: Vec<&str> = rest.splitn(2, " and ").collect();
199 if parts.len() == 2 {
200 let target = parts[1].trim_start_matches("b/");
202 validate_diff_path(target)
204 } else {
205 None
206 }
207}
208
209fn validate_diff_path(path: &str) -> Option<String> {
211 let path = path.trim_start_matches("a/").trim_start_matches("b/");
213 if path.starts_with('/') {
215 tracing::warn!("Rejected absolute path from diff: {}", path);
216 return None;
217 }
218 if path.split('/').any(|component| component == "..") {
220 tracing::warn!("Rejected traversal path from diff: {}", path);
221 return None;
222 }
223 if path.contains('\0') {
225 tracing::warn!("Rejected path with null byte from diff");
226 return None;
227 }
228 Some(path.to_string())
229}
230
231fn resolve_if_symlink(path: &str) -> String {
234 let p = std::path::Path::new(path);
235 match std::fs::symlink_metadata(p) {
237 Ok(meta) if meta.file_type().is_symlink() => {
238 match std::fs::canonicalize(p) {
239 Ok(resolved) => {
240 if let Ok(cwd) = std::env::current_dir() {
242 let canonical_cwd = std::fs::canonicalize(&cwd).unwrap_or(cwd);
243 if resolved.starts_with(&canonical_cwd) {
244 resolved.to_string_lossy().to_string()
245 } else {
246 tracing::warn!(
247 "Symlink {} resolves outside repo root to {}, using original path",
248 path,
249 resolved.display()
250 );
251 path.to_string()
252 }
253 } else {
254 path.to_string()
255 }
256 }
257 Err(_) => path.to_string(),
258 }
259 }
260 _ => path.to_string(),
261 }
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain relative path is accepted as-is.
    #[test]
    fn test_validate_diff_path_normal() {
        let validated = validate_diff_path("src/main.rs");
        assert_eq!(validated.as_deref(), Some("src/main.rs"));
    }

    /// Leading `..` components are refused.
    #[test]
    fn test_validate_diff_path_traversal_rejected() {
        assert!(validate_diff_path("../../../etc/passwd").is_none());
    }

    /// A `..` component in the middle of the path is refused as well.
    #[test]
    fn test_validate_diff_path_embedded_traversal_rejected() {
        assert!(validate_diff_path("src/../lib.rs").is_none());
    }

    /// Absolute paths are refused.
    #[test]
    fn test_validate_diff_path_absolute_rejected() {
        assert!(validate_diff_path("/etc/passwd").is_none());
    }

    /// Nested relative paths are accepted unchanged.
    #[test]
    fn test_validate_diff_path_normal_nested() {
        let validated = validate_diff_path("normal/path/file.rs");
        assert_eq!(validated.as_deref(), Some("normal/path/file.rs"));
    }

    /// Both the `b/` and the `a/` diff prefix are removed.
    #[test]
    fn test_validate_diff_path_strips_prefix() {
        let from_target = validate_diff_path("b/src/main.rs");
        assert_eq!(from_target.as_deref(), Some("src/main.rs"));

        let from_source = validate_diff_path("a/src/main.rs");
        assert_eq!(from_source.as_deref(), Some("src/main.rs"));
    }

    /// Paths containing a NUL byte are refused.
    #[test]
    fn test_validate_diff_path_null_byte_rejected() {
        assert!(validate_diff_path("src/\0evil.rs").is_none());
    }

    /// A binary-file line whose target escapes the tree yields nothing.
    #[test]
    fn test_extract_binary_path_with_traversal_returns_none() {
        let line = "Binary files a/normal.png and b/../../../etc/shadow differ";
        assert!(extract_binary_path(line).is_none());
    }

    /// A well-formed binary-file line yields the prefix-free target path.
    #[test]
    fn test_extract_binary_path_valid() {
        let line = "Binary files a/icon.png and b/icon.png differ";
        assert_eq!(extract_binary_path(line).as_deref(), Some("icon.png"));
    }

    /// `parse` must not surface any file whose target contains `..`.
    #[test]
    fn test_parse_with_traversal_path_skipped() {
        let raw = concat!(
            "diff --git a/../../../etc/passwd b/../../../etc/passwd\n",
            "--- a/../../../etc/passwd\n",
            "+++ b/../../../etc/passwd\n",
            "@@ -0,0 +1 @@\n",
            "+malicious content\n",
        );
        let parsed = parse(raw);
        let leaked = parsed
            .files
            .iter()
            .any(|file| file.target_file.contains(".."));
        assert!(!leaked, "Traversal paths should be rejected");
    }

    /// A path that does not exist on disk is handed back untouched.
    #[test]
    fn test_resolve_if_symlink_nonexistent() {
        let input = "nonexistent/path/file.rs";
        assert_eq!(resolve_if_symlink(input), input);
    }
}