grit_lib/patch_ids.rs
1//! Patch-ID computation for commit equivalence detection.
2//!
3//! A patch-ID is a SHA-1 digest of the normalised diff a commit introduces.
4//! Whitespace is stripped from every changed line before hashing, so two
5//! commits whose diffs differ only in whitespace (spaces, tabs, newlines)
6//! produce identical patch-IDs. This is the semantics required by
7//! `git cherry` and `git format-patch --ignore-if-in-upstream`.
8//!
9//! Two complementary entry points are provided:
10//!
11//! - [`compute_patch_id`] operates on a repository's object database, computing
12//! the diff from the commit's tree objects.
13//! - [`compute_patch_ids_from_text`] parses unified diff text from stdin (e.g.
14//! output of `git log -p` or `git diff-tree --patch --stdin`), matching the
15//! behaviour of `git patch-id`.
16
17use sha1::{Digest, Sha1};
18use similar::{ChangeTag, TextDiff};
19
20use crate::diff::{diff_trees, zero_oid};
21use crate::error::Result;
22use crate::objects::{parse_commit, ObjectId, ObjectKind};
23use crate::odb::Odb;
24
25/// How to compute a patch-ID from unified diff text.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub enum PatchIdMode {
28 /// Accumulate file diffs into one rolling SHA-1; file order affects the
29 /// result. Compatible with Git 1.9 and older. This is the default.
30 Unstable,
31 /// Hash each file's diff independently and accumulate results with a
32 /// carry-addition, so file order does not affect the patch-ID.
33 Stable,
34 /// Like [`PatchIdMode::Stable`] but whitespace is not stripped before
35 /// hashing.
36 Verbatim,
37}
38
39/// Compute patch-IDs from unified diff text (e.g. `git log -p` output).
40///
41/// Parses the text line by line. Each time a `commit <oid>` or
42/// `From <oid> …` marker is found the accumulated hash for the previous patch
43/// is finalised and emitted as `(patch_id, commit_id)`. Commits with no diff
44/// content (empty patches, merge commits) are silently skipped.
45///
46/// # Parameters
47///
48/// - `input` — raw bytes of the unified diff stream.
49/// - `mode` — which patch-ID algorithm to use (see [`PatchIdMode`]).
50///
51/// # Returns
52///
53/// A `Vec` of `(patch_id, commit_id)` pairs in the order they were encountered
54/// in the stream.
55pub fn compute_patch_ids_from_text(input: &[u8], mode: PatchIdMode) -> Vec<(ObjectId, ObjectId)> {
56 let stable = mode != PatchIdMode::Unstable;
57 let verbatim = mode == PatchIdMode::Verbatim;
58
59 let mut results: Vec<(ObjectId, ObjectId)> = Vec::new();
60
61 // Current accumulated state for the patch being processed.
62 let mut ctx = Sha1::new();
63 let mut result = [0u8; 20];
64 let mut patchlen: usize = 0;
65 // before/after: -1 = parsing file header, 0 = awaiting @@ hunk, >0 = in hunk
66 let mut before: i32 = -1;
67 let mut after: i32 = -1;
68 let mut diff_is_binary = false;
69 let mut pre_oid_str = String::new();
70 let mut post_oid_str = String::new();
71 let mut current_commit: Option<ObjectId> = None;
72 // When the input starts directly with a diff (no commit header),
73 // use the zero OID as the commit ID.
74 let mut implicit_commit = true;
75
76 // Iterate lines; we keep the trailing '\n' so remove_space mirrors git's
77 // behaviour (newline counts as whitespace and is stripped).
78 let lines = split_lines_with_nl(input);
79
80 let mut i = 0;
81 while i < lines.len() {
82 let raw = lines[i];
83 i += 1;
84
85 // The line as a &str for prefix checks; non-UTF-8 treated as empty.
86 let line = std::str::from_utf8(raw).unwrap_or("");
87
88 // Try to strip "commit " or "From " prefix to reach a potential OID.
89 let oid_candidate: Option<&str> = if let Some(rest) = line.strip_prefix("commit ") {
90 Some(rest)
91 } else if let Some(rest) = line.strip_prefix("From ") {
92 Some(rest)
93 } else {
94 None
95 };
96
97 if let Some(candidate) = oid_candidate {
98 if let Some(oid) = try_parse_oid_prefix(candidate) {
99 // Finalise the patch we've been accumulating.
100 text_flush_one_hunk(&mut result, &mut ctx);
101 if patchlen > 0 {
102 if let Some(coid) = current_commit.take() {
103 if let Ok(pid) = ObjectId::from_bytes(&result) {
104 results.push((pid, coid));
105 }
106 }
107 }
108 // Reset for the new patch.
109 result = [0u8; 20];
110 ctx = Sha1::new();
111 patchlen = 0;
112 before = -1;
113 after = -1;
114 diff_is_binary = false;
115 pre_oid_str.clear();
116 post_oid_str.clear();
117 current_commit = Some(oid);
118 implicit_commit = false;
119 continue;
120 }
121 }
122
123 // "\ No newline at end of file" markers.
124 if line.starts_with("\\ ") && line.len() > 12 {
125 if verbatim {
126 ctx.update(raw);
127 patchlen += raw.len();
128 }
129 continue;
130 }
131
132 // Skip commit metadata before the first diff hunk.
133 if patchlen == 0 && !line.starts_with("diff ") {
134 continue;
135 }
136
137 // If we see a diff line without a preceding commit marker,
138 // treat it as a single patch with zero commit OID.
139 if implicit_commit && line.starts_with("diff ") && current_commit.is_none() {
140 current_commit = Some(ObjectId::from_bytes(&[0u8; 20]).unwrap());
141 implicit_commit = false;
142 }
143
144 // Parsing the per-file header (before == -1).
145 if before == -1 {
146 if line.starts_with("GIT binary patch") || line.starts_with("Binary files") {
147 // Binary: only hash the blob OID strings.
148 diff_is_binary = true;
149 before = 0;
150 let pre = pre_oid_str.clone();
151 let post = post_oid_str.clone();
152 ctx.update(pre.as_bytes());
153 ctx.update(post.as_bytes());
154 patchlen += pre.len() + post.len();
155 if stable {
156 text_flush_one_hunk(&mut result, &mut ctx);
157 }
158 continue;
159 } else if let Some(rest) = line.strip_prefix("index ") {
160 // index <pre>..<post>[ <mode>]
161 if let Some(dd) = rest.find("..") {
162 pre_oid_str = rest[..dd].to_owned();
163 let tail = &rest[dd + 2..];
164 let end = tail
165 .find(|c: char| c.is_ascii_whitespace())
166 .unwrap_or_else(|| {
167 tail.trim_end_matches('\n').trim_end_matches('\r').len()
168 });
169 post_oid_str = tail[..end].to_owned();
170 }
171 continue;
172 } else if line.starts_with("--- ") {
173 before = 1;
174 after = 1;
175 // Fall through to hunk-content processing below.
176 } else if !line.chars().next().is_some_and(|c| c.is_ascii_alphabetic()) {
177 // Non-alpha first char signals end of this patch's diffs.
178 // Re-use the `continue` path; treat as patch boundary.
179 text_flush_one_hunk(&mut result, &mut ctx);
180 if patchlen > 0 {
181 if let Some(coid) = current_commit.take() {
182 if let Ok(pid) = ObjectId::from_bytes(&result) {
183 results.push((pid, coid));
184 }
185 }
186 }
187 result = [0u8; 20];
188 ctx = Sha1::new();
189 patchlen = 0;
190 before = -1;
191 after = -1;
192 diff_is_binary = false;
193 continue;
194 }
195 }
196
197 // Skip body of binary diffs; reset on the next `diff ` line.
198 if diff_is_binary {
199 if line.starts_with("diff ") {
200 diff_is_binary = false;
201 before = -1;
202 // Process this `diff ` line as a new file header.
203 i -= 1; // re-process
204 }
205 continue;
206 }
207
208 // Waiting for a hunk header or the next file.
209 if before == 0 && after == 0 {
210 if line.starts_with("@@ -") {
211 let (b, a) = scan_hunk_header(line);
212 before = b;
213 after = a;
214 continue;
215 }
216 if !line.starts_with("diff ") {
217 // End of this file's content; nothing special to do—
218 // if it's a commit boundary the loop head will handle it.
219 continue;
220 }
221 // Another file diff starts; flush per-file hash in stable mode.
222 if stable {
223 text_flush_one_hunk(&mut result, &mut ctx);
224 }
225 before = -1;
226 after = -1;
227 // Re-process this `diff ` line as a new file header.
228 i -= 1;
229 continue;
230 }
231
232 // Inside a hunk — update the line counters.
233 let first = raw.first().copied().unwrap_or(b' ');
234 if first == b'-' || first == b' ' {
235 before -= 1;
236 }
237 if first == b'+' || first == b' ' {
238 after -= 1;
239 }
240
241 // Hash the line, optionally stripping whitespace.
242 let hashed = if verbatim {
243 ctx.update(raw);
244 raw.len()
245 } else {
246 hash_without_whitespace(&mut ctx, raw)
247 };
248 patchlen += hashed;
249 }
250
251 // Flush the final patch.
252 text_flush_one_hunk(&mut result, &mut ctx);
253 if patchlen > 0 {
254 if let Some(coid) = current_commit {
255 if let Ok(pid) = ObjectId::from_bytes(&result) {
256 results.push((pid, coid));
257 }
258 }
259 }
260
261 results
262}
263
264/// Finalise `ctx`, accumulate its digest into `result` with byte-wise
265/// carry-addition (mirrors git's `flush_one_hunk`), and reset `ctx`.
266fn text_flush_one_hunk(result: &mut [u8; 20], ctx: &mut Sha1) {
267 let old = std::mem::replace(ctx, Sha1::new());
268 let hash: [u8; 20] = old.finalize().into();
269 let mut carry: u16 = 0;
270 for i in 0..20 {
271 carry = carry + result[i] as u16 + hash[i] as u16;
272 result[i] = carry as u8;
273 carry >>= 8;
274 }
275}
276
277/// Hash `raw` bytes into `ctx`, skipping ASCII whitespace.
278///
279/// Returns the number of non-whitespace bytes fed to the hasher.
280fn hash_without_whitespace(ctx: &mut Sha1, raw: &[u8]) -> usize {
281 let mut count = 0;
282 for &b in raw {
283 if !b.is_ascii_whitespace() {
284 ctx.update([b]);
285 count += 1;
286 }
287 }
288 count
289}
290
291/// Parse a 40-hex OID at the start of `s`.
292///
293/// Returns `None` if `s` does not start with exactly 40 lowercase hex digits
294/// optionally followed by ASCII whitespace.
295fn try_parse_oid_prefix(s: &str) -> Option<ObjectId> {
296 let s = s.trim_end_matches('\n').trim_end_matches('\r');
297 if s.len() < 40 {
298 return None;
299 }
300 let hex = &s[..40];
301 if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
302 return None;
303 }
304 // The character after the OID (if any) must be whitespace or end of string.
305 if s.len() > 40 && !s.as_bytes()[40].is_ascii_whitespace() {
306 return None;
307 }
308 let mut bytes = [0u8; 20];
309 for (i, chunk) in hex.as_bytes().chunks(2).enumerate() {
310 let hi = hex_val(chunk[0])?;
311 let lo = hex_val(chunk[1])?;
312 bytes[i] = (hi << 4) | lo;
313 }
314 ObjectId::from_bytes(&bytes).ok()
315}
316
317/// Convert a single ASCII hex digit to its value.
318fn hex_val(b: u8) -> Option<u8> {
319 match b {
320 b'0'..=b'9' => Some(b - b'0'),
321 b'a'..=b'f' => Some(b - b'a' + 10),
322 b'A'..=b'F' => Some(b - b'A' + 10),
323 _ => None,
324 }
325}
326
327/// Parse the before/after line counts from a `@@ -<old>[,<n>] +<new>[,<m>] @@` header.
328///
329/// Returns `(before, after)` counts, defaulting to 1 when the count is absent.
330fn scan_hunk_header(line: &str) -> (i32, i32) {
331 // line starts with "@@ -"
332 let rest = match line.strip_prefix("@@ -") {
333 Some(r) => r,
334 None => return (1, 1),
335 };
336 // Parse old count: skip start line number, grab optional comma + count.
337 let before = parse_hunk_count(rest);
338 // Find " +" separator.
339 let after = rest
340 .find(" +")
341 .and_then(|p| parse_hunk_count_opt(&rest[p + 2..]))
342 .unwrap_or(1);
343 (before, after)
344}
345
346/// Parse `<start>[,<count>]` and return `count` (or 1 if absent).
347fn parse_hunk_count(s: &str) -> i32 {
348 // Skip digits of the start line number.
349 let after_start = s.trim_start_matches(|c: char| c.is_ascii_digit());
350 if let Some(rest) = after_start.strip_prefix(',') {
351 rest.split(|c: char| !c.is_ascii_digit())
352 .next()
353 .and_then(|n| n.parse().ok())
354 .unwrap_or(1)
355 } else {
356 1
357 }
358}
359
360/// Same as [`parse_hunk_count`] but returns `Option`.
361fn parse_hunk_count_opt(s: &str) -> Option<i32> {
362 Some(parse_hunk_count(s))
363}
364
365/// Split `input` into lines, preserving the trailing `\n` on each line.
366///
367/// The final slice may lack a trailing newline if the input doesn't end with
368/// one.
369fn split_lines_with_nl(input: &[u8]) -> Vec<&[u8]> {
370 let mut lines = Vec::new();
371 let mut start = 0;
372 for (i, &b) in input.iter().enumerate() {
373 if b == b'\n' {
374 lines.push(&input[start..=i]);
375 start = i + 1;
376 }
377 }
378 if start < input.len() {
379 lines.push(&input[start..]);
380 }
381 lines
382}
383
384/// Compute the patch-ID for a single commit.
385///
386/// Returns `None` for merge commits (more than one parent), since those do not
387/// have a well-defined single-parent diff. Root commits (no parents) are
388/// compared against the empty tree.
389///
390/// # Parameters
391///
392/// - `odb` — object database used to read commit, tree, and blob objects.
393/// - `commit_oid` — OID of the commit to compute the patch-ID for.
394///
395/// # Errors
396///
397/// Returns errors from object-database reads or object-parse failures.
398pub fn compute_patch_id(odb: &Odb, commit_oid: &ObjectId) -> Result<Option<ObjectId>> {
399 let obj = odb.read(commit_oid)?;
400 if obj.kind != ObjectKind::Commit {
401 return Ok(None);
402 }
403 let commit = parse_commit(&obj.data)?;
404
405 // Merge commits (>1 parent) have no defined patch-id.
406 if commit.parents.len() > 1 {
407 return Ok(None);
408 }
409
410 // Resolve the parent tree (None = empty tree for root commits).
411 let parent_tree_oid = if commit.parents.is_empty() {
412 None
413 } else {
414 let parent_obj = odb.read(&commit.parents[0])?;
415 let parent_commit = parse_commit(&parent_obj.data)?;
416 Some(parent_commit.tree)
417 };
418
419 // Compute tree-to-tree diff.
420 let mut diffs = diff_trees(odb, parent_tree_oid.as_ref(), Some(&commit.tree), "")?;
421
422 // Sort by primary path (lexicographic), matching diffcore_std ordering.
423 diffs.sort_by(|a, b| a.path().cmp(b.path()));
424
425 let mut hasher = Sha1::new();
426
427 for entry in &diffs {
428 // Determine src and dst path strings (both sides always carry the
429 // logical path, even for pure adds/deletes).
430 let src = entry
431 .old_path
432 .as_deref()
433 .or(entry.new_path.as_deref())
434 .unwrap_or("");
435 let dst = entry
436 .new_path
437 .as_deref()
438 .or(entry.old_path.as_deref())
439 .unwrap_or("");
440
441 // Compact paths: remove all ASCII-whitespace (mirrors git's remove_space).
442 let src_compact = compact_path(src);
443 let dst_compact = compact_path(dst);
444
445 // Hash the diff header line.
446 let header = format!("diff --git a/{src_compact} b/{dst_compact}\n");
447 hasher.update(header.as_bytes());
448
449 // Read blob content for the old and new sides.
450 let old_bytes = read_blob(odb, &entry.old_oid)?;
451 let new_bytes = read_blob(odb, &entry.new_oid)?;
452
453 // Convert to str slices for the line-diff algorithm; treat non-UTF-8
454 // content as empty (binary blobs only contribute their header).
455 let old_str = std::str::from_utf8(&old_bytes).unwrap_or("");
456 let new_str = std::str::from_utf8(&new_bytes).unwrap_or("");
457
458 // Hash non-whitespace bytes from every added and deleted line.
459 let diff = TextDiff::from_lines(old_str, new_str);
460 for change in diff.iter_all_changes() {
461 match change.tag() {
462 ChangeTag::Delete | ChangeTag::Insert => {
463 let line = change.as_str().unwrap_or("");
464 for &byte in line.as_bytes() {
465 if !byte.is_ascii_whitespace() {
466 hasher.update([byte]);
467 }
468 }
469 }
470 ChangeTag::Equal => {}
471 }
472 }
473 }
474
475 let digest = hasher.finalize();
476 ObjectId::from_bytes(&digest).map(Some)
477}
478
479/// Remove all ASCII-whitespace characters from a path string.
480///
481/// Mirrors git's `remove_space()` used when hashing diff headers.
482fn compact_path(path: &str) -> String {
483 path.bytes()
484 .filter(|b| !b.is_ascii_whitespace())
485 .map(|b| b as char)
486 .collect()
487}
488
489/// Read a blob's raw bytes from the ODB.
490///
491/// Returns an empty `Vec` for the zero OID (representing an absent file).
492fn read_blob(odb: &Odb, oid: &ObjectId) -> Result<Vec<u8>> {
493 if *oid == zero_oid() {
494 return Ok(Vec::new());
495 }
496 let obj = odb.read(oid)?;
497 Ok(obj.data)
498}