grit_lib/patch_ids.rs
1//! Patch-ID computation for commit equivalence detection.
2//!
3//! A patch-ID is a SHA-1 digest of the normalised diff a commit introduces.
4//! Whitespace is stripped from every changed line before hashing, so two
5//! commits whose diffs differ only in whitespace (spaces, tabs, newlines)
6//! produce identical patch-IDs. This is the semantics required by
7//! `git cherry` and `git format-patch --ignore-if-in-upstream`.
8//!
9//! Two complementary entry points are provided:
10//!
11//! - [`compute_patch_id`] operates on a repository's object database, computing
12//! the diff from the commit's tree objects.
13//! - [`compute_patch_ids_from_text`] parses unified diff text from stdin (e.g.
14//! output of `git log -p` or `git diff-tree --patch --stdin`), matching the
15//! behaviour of `git patch-id`.
16
17use sha1::{Digest, Sha1};
18use similar::{ChangeTag, TextDiff};
19
20use crate::diff::{diff_trees, zero_oid};
21use crate::error::Result;
22use crate::objects::{parse_commit, ObjectId, ObjectKind};
23use crate::odb::Odb;
24
25/// How to compute a patch-ID from unified diff text.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub enum PatchIdMode {
28 /// Accumulate file diffs into one rolling SHA-1; file order affects the
29 /// result. Compatible with Git 1.9 and older. This is the default.
30 Unstable,
31 /// Hash each file's diff independently and accumulate results with a
32 /// carry-addition, so file order does not affect the patch-ID.
33 Stable,
34 /// Like [`PatchIdMode::Stable`] but whitespace is not stripped before
35 /// hashing.
36 Verbatim,
37}
38
39/// Compute patch-IDs from unified diff text (e.g. `git log -p` output).
40///
41/// Parses the text line by line. Each time a `commit <oid>` or
42/// `From <oid> …` marker is found the accumulated hash for the previous patch
43/// is finalised and emitted as `(patch_id, commit_id)`. Commits with no diff
44/// content (empty patches, merge commits) are silently skipped.
45///
46/// # Parameters
47///
48/// - `input` — raw bytes of the unified diff stream.
49/// - `mode` — which patch-ID algorithm to use (see [`PatchIdMode`]).
50///
51/// # Returns
52///
53/// A `Vec` of `(patch_id, commit_id)` pairs in the order they were encountered
54/// in the stream.
55pub fn compute_patch_ids_from_text(input: &[u8], mode: PatchIdMode) -> Vec<(ObjectId, ObjectId)> {
56 let stable = mode != PatchIdMode::Unstable;
57 let verbatim = mode == PatchIdMode::Verbatim;
58
59 let mut results: Vec<(ObjectId, ObjectId)> = Vec::new();
60
61 // Current accumulated state for the patch being processed.
62 let mut ctx = Sha1::new();
63 let mut result = [0u8; 20];
64 let mut patchlen: usize = 0;
65 // before/after: -1 = parsing file header, 0 = awaiting @@ hunk, >0 = in hunk
66 let mut before: i32 = -1;
67 let mut after: i32 = -1;
68 let mut diff_is_binary = false;
69 let mut pre_oid_str = String::new();
70 let mut post_oid_str = String::new();
71 let mut current_commit: Option<ObjectId> = None;
72
73 // Iterate lines; we keep the trailing '\n' so remove_space mirrors git's
74 // behaviour (newline counts as whitespace and is stripped).
75 let lines = split_lines_with_nl(input);
76
77 let mut i = 0;
78 while i < lines.len() {
79 let raw = lines[i];
80 i += 1;
81
82 // The line as a &str for prefix checks; non-UTF-8 treated as empty.
83 let line = std::str::from_utf8(raw).unwrap_or("");
84
85 // Try to strip "commit " or "From " prefix to reach a potential OID.
86 let oid_candidate: Option<&str> = if let Some(rest) = line.strip_prefix("commit ") {
87 Some(rest)
88 } else if let Some(rest) = line.strip_prefix("From ") {
89 Some(rest)
90 } else {
91 None
92 };
93
94 if let Some(candidate) = oid_candidate {
95 if let Some(oid) = try_parse_oid_prefix(candidate) {
96 // Finalise the patch we've been accumulating.
97 text_flush_one_hunk(&mut result, &mut ctx);
98 if patchlen > 0 {
99 if let Some(coid) = current_commit.take() {
100 if let Ok(pid) = ObjectId::from_bytes(&result) {
101 results.push((pid, coid));
102 }
103 }
104 }
105 // Reset for the new patch.
106 result = [0u8; 20];
107 ctx = Sha1::new();
108 patchlen = 0;
109 before = -1;
110 after = -1;
111 diff_is_binary = false;
112 pre_oid_str.clear();
113 post_oid_str.clear();
114 current_commit = Some(oid);
115 continue;
116 }
117 }
118
119 // "\ No newline at end of file" markers.
120 if line.starts_with("\\ ") && line.len() > 12 {
121 if verbatim {
122 ctx.update(raw);
123 patchlen += raw.len();
124 }
125 continue;
126 }
127
128 // Skip commit metadata before the first diff hunk.
129 if patchlen == 0 && !line.starts_with("diff ") {
130 continue;
131 }
132
133 // Parsing the per-file header (before == -1).
134 if before == -1 {
135 if line.starts_with("GIT binary patch") || line.starts_with("Binary files") {
136 // Binary: only hash the blob OID strings.
137 diff_is_binary = true;
138 before = 0;
139 let pre = pre_oid_str.clone();
140 let post = post_oid_str.clone();
141 ctx.update(pre.as_bytes());
142 ctx.update(post.as_bytes());
143 patchlen += pre.len() + post.len();
144 if stable {
145 text_flush_one_hunk(&mut result, &mut ctx);
146 }
147 continue;
148 } else if let Some(rest) = line.strip_prefix("index ") {
149 // index <pre>..<post>[ <mode>]
150 if let Some(dd) = rest.find("..") {
151 pre_oid_str = rest[..dd].to_owned();
152 let tail = &rest[dd + 2..];
153 let end = tail
154 .find(|c: char| c.is_ascii_whitespace())
155 .unwrap_or_else(|| {
156 tail.trim_end_matches('\n').trim_end_matches('\r').len()
157 });
158 post_oid_str = tail[..end].to_owned();
159 }
160 continue;
161 } else if line.starts_with("--- ") {
162 before = 1;
163 after = 1;
164 // Fall through to hunk-content processing below.
165 } else if !line
166 .chars()
167 .next()
168 .is_some_and(|c| c.is_ascii_alphabetic())
169 {
170 // Non-alpha first char signals end of this patch's diffs.
171 // Re-use the `continue` path; treat as patch boundary.
172 text_flush_one_hunk(&mut result, &mut ctx);
173 if patchlen > 0 {
174 if let Some(coid) = current_commit.take() {
175 if let Ok(pid) = ObjectId::from_bytes(&result) {
176 results.push((pid, coid));
177 }
178 }
179 }
180 result = [0u8; 20];
181 ctx = Sha1::new();
182 patchlen = 0;
183 before = -1;
184 after = -1;
185 diff_is_binary = false;
186 continue;
187 }
188 }
189
190 // Skip body of binary diffs; reset on the next `diff ` line.
191 if diff_is_binary {
192 if line.starts_with("diff ") {
193 diff_is_binary = false;
194 before = -1;
195 // Process this `diff ` line as a new file header.
196 i -= 1; // re-process
197 }
198 continue;
199 }
200
201 // Waiting for a hunk header or the next file.
202 if before == 0 && after == 0 {
203 if line.starts_with("@@ -") {
204 let (b, a) = scan_hunk_header(line);
205 before = b;
206 after = a;
207 continue;
208 }
209 if !line.starts_with("diff ") {
210 // End of this file's content; nothing special to do—
211 // if it's a commit boundary the loop head will handle it.
212 continue;
213 }
214 // Another file diff starts; flush per-file hash in stable mode.
215 if stable {
216 text_flush_one_hunk(&mut result, &mut ctx);
217 }
218 before = -1;
219 after = -1;
220 // Re-process this `diff ` line as a new file header.
221 i -= 1;
222 continue;
223 }
224
225 // Inside a hunk — update the line counters.
226 let first = raw.first().copied().unwrap_or(b' ');
227 if first == b'-' || first == b' ' {
228 before -= 1;
229 }
230 if first == b'+' || first == b' ' {
231 after -= 1;
232 }
233
234 // Hash the line, optionally stripping whitespace.
235 let hashed = if verbatim {
236 ctx.update(raw);
237 raw.len()
238 } else {
239 hash_without_whitespace(&mut ctx, raw)
240 };
241 patchlen += hashed;
242 }
243
244 // Flush the final patch.
245 text_flush_one_hunk(&mut result, &mut ctx);
246 if patchlen > 0 {
247 if let Some(coid) = current_commit {
248 if let Ok(pid) = ObjectId::from_bytes(&result) {
249 results.push((pid, coid));
250 }
251 }
252 }
253
254 results
255}
256
257/// Finalise `ctx`, accumulate its digest into `result` with byte-wise
258/// carry-addition (mirrors git's `flush_one_hunk`), and reset `ctx`.
259fn text_flush_one_hunk(result: &mut [u8; 20], ctx: &mut Sha1) {
260 let old = std::mem::replace(ctx, Sha1::new());
261 let hash: [u8; 20] = old.finalize().into();
262 let mut carry: u16 = 0;
263 for i in 0..20 {
264 carry = carry + result[i] as u16 + hash[i] as u16;
265 result[i] = carry as u8;
266 carry >>= 8;
267 }
268}
269
270/// Hash `raw` bytes into `ctx`, skipping ASCII whitespace.
271///
272/// Returns the number of non-whitespace bytes fed to the hasher.
273fn hash_without_whitespace(ctx: &mut Sha1, raw: &[u8]) -> usize {
274 let mut count = 0;
275 for &b in raw {
276 if !b.is_ascii_whitespace() {
277 ctx.update([b]);
278 count += 1;
279 }
280 }
281 count
282}
283
284/// Parse a 40-hex OID at the start of `s`.
285///
286/// Returns `None` if `s` does not start with exactly 40 lowercase hex digits
287/// optionally followed by ASCII whitespace.
288fn try_parse_oid_prefix(s: &str) -> Option<ObjectId> {
289 let s = s.trim_end_matches('\n').trim_end_matches('\r');
290 if s.len() < 40 {
291 return None;
292 }
293 let hex = &s[..40];
294 if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
295 return None;
296 }
297 // The character after the OID (if any) must be whitespace or end of string.
298 if s.len() > 40 && !s.as_bytes()[40].is_ascii_whitespace() {
299 return None;
300 }
301 let mut bytes = [0u8; 20];
302 for (i, chunk) in hex.as_bytes().chunks(2).enumerate() {
303 let hi = hex_val(chunk[0])?;
304 let lo = hex_val(chunk[1])?;
305 bytes[i] = (hi << 4) | lo;
306 }
307 ObjectId::from_bytes(&bytes).ok()
308}
309
310/// Convert a single ASCII hex digit to its value.
311fn hex_val(b: u8) -> Option<u8> {
312 match b {
313 b'0'..=b'9' => Some(b - b'0'),
314 b'a'..=b'f' => Some(b - b'a' + 10),
315 b'A'..=b'F' => Some(b - b'A' + 10),
316 _ => None,
317 }
318}
319
320/// Parse the before/after line counts from a `@@ -<old>[,<n>] +<new>[,<m>] @@` header.
321///
322/// Returns `(before, after)` counts, defaulting to 1 when the count is absent.
323fn scan_hunk_header(line: &str) -> (i32, i32) {
324 // line starts with "@@ -"
325 let rest = match line.strip_prefix("@@ -") {
326 Some(r) => r,
327 None => return (1, 1),
328 };
329 // Parse old count: skip start line number, grab optional comma + count.
330 let before = parse_hunk_count(rest);
331 // Find " +" separator.
332 let after = rest
333 .find(" +")
334 .and_then(|p| parse_hunk_count_opt(&rest[p + 2..]))
335 .unwrap_or(1);
336 (before, after)
337}
338
339/// Parse `<start>[,<count>]` and return `count` (or 1 if absent).
340fn parse_hunk_count(s: &str) -> i32 {
341 // Skip digits of the start line number.
342 let after_start = s.trim_start_matches(|c: char| c.is_ascii_digit());
343 if let Some(rest) = after_start.strip_prefix(',') {
344 rest.split(|c: char| !c.is_ascii_digit())
345 .next()
346 .and_then(|n| n.parse().ok())
347 .unwrap_or(1)
348 } else {
349 1
350 }
351}
352
353/// Same as [`parse_hunk_count`] but returns `Option`.
354fn parse_hunk_count_opt(s: &str) -> Option<i32> {
355 Some(parse_hunk_count(s))
356}
357
358/// Split `input` into lines, preserving the trailing `\n` on each line.
359///
360/// The final slice may lack a trailing newline if the input doesn't end with
361/// one.
362fn split_lines_with_nl(input: &[u8]) -> Vec<&[u8]> {
363 let mut lines = Vec::new();
364 let mut start = 0;
365 for (i, &b) in input.iter().enumerate() {
366 if b == b'\n' {
367 lines.push(&input[start..=i]);
368 start = i + 1;
369 }
370 }
371 if start < input.len() {
372 lines.push(&input[start..]);
373 }
374 lines
375}
376
377/// Compute the patch-ID for a single commit.
378///
379/// Returns `None` for merge commits (more than one parent), since those do not
380/// have a well-defined single-parent diff. Root commits (no parents) are
381/// compared against the empty tree.
382///
383/// # Parameters
384///
385/// - `odb` — object database used to read commit, tree, and blob objects.
386/// - `commit_oid` — OID of the commit to compute the patch-ID for.
387///
388/// # Errors
389///
390/// Returns errors from object-database reads or object-parse failures.
391pub fn compute_patch_id(odb: &Odb, commit_oid: &ObjectId) -> Result<Option<ObjectId>> {
392 let obj = odb.read(commit_oid)?;
393 if obj.kind != ObjectKind::Commit {
394 return Ok(None);
395 }
396 let commit = parse_commit(&obj.data)?;
397
398 // Merge commits (>1 parent) have no defined patch-id.
399 if commit.parents.len() > 1 {
400 return Ok(None);
401 }
402
403 // Resolve the parent tree (None = empty tree for root commits).
404 let parent_tree_oid = if commit.parents.is_empty() {
405 None
406 } else {
407 let parent_obj = odb.read(&commit.parents[0])?;
408 let parent_commit = parse_commit(&parent_obj.data)?;
409 Some(parent_commit.tree)
410 };
411
412 // Compute tree-to-tree diff.
413 let mut diffs = diff_trees(odb, parent_tree_oid.as_ref(), Some(&commit.tree), "")?;
414
415 // Sort by primary path (lexicographic), matching diffcore_std ordering.
416 diffs.sort_by(|a, b| a.path().cmp(b.path()));
417
418 let mut hasher = Sha1::new();
419
420 for entry in &diffs {
421 // Determine src and dst path strings (both sides always carry the
422 // logical path, even for pure adds/deletes).
423 let src = entry
424 .old_path
425 .as_deref()
426 .or(entry.new_path.as_deref())
427 .unwrap_or("");
428 let dst = entry
429 .new_path
430 .as_deref()
431 .or(entry.old_path.as_deref())
432 .unwrap_or("");
433
434 // Compact paths: remove all ASCII-whitespace (mirrors git's remove_space).
435 let src_compact = compact_path(src);
436 let dst_compact = compact_path(dst);
437
438 // Hash the diff header line.
439 let header = format!("diff --git a/{src_compact} b/{dst_compact}\n");
440 hasher.update(header.as_bytes());
441
442 // Read blob content for the old and new sides.
443 let old_bytes = read_blob(odb, &entry.old_oid)?;
444 let new_bytes = read_blob(odb, &entry.new_oid)?;
445
446 // Convert to str slices for the line-diff algorithm; treat non-UTF-8
447 // content as empty (binary blobs only contribute their header).
448 let old_str = std::str::from_utf8(&old_bytes).unwrap_or("");
449 let new_str = std::str::from_utf8(&new_bytes).unwrap_or("");
450
451 // Hash non-whitespace bytes from every added and deleted line.
452 let diff = TextDiff::from_lines(old_str, new_str);
453 for change in diff.iter_all_changes() {
454 match change.tag() {
455 ChangeTag::Delete | ChangeTag::Insert => {
456 let line = change.as_str().unwrap_or("");
457 for &byte in line.as_bytes() {
458 if !byte.is_ascii_whitespace() {
459 hasher.update([byte]);
460 }
461 }
462 }
463 ChangeTag::Equal => {}
464 }
465 }
466 }
467
468 let digest = hasher.finalize();
469 ObjectId::from_bytes(&digest).map(Some)
470}
471
472/// Remove all ASCII-whitespace characters from a path string.
473///
474/// Mirrors git's `remove_space()` used when hashing diff headers.
475fn compact_path(path: &str) -> String {
476 path.bytes()
477 .filter(|b| !b.is_ascii_whitespace())
478 .map(|b| b as char)
479 .collect()
480}
481
482/// Read a blob's raw bytes from the ODB.
483///
484/// Returns an empty `Vec` for the zero OID (representing an absent file).
485fn read_blob(odb: &Odb, oid: &ObjectId) -> Result<Vec<u8>> {
486 if *oid == zero_oid() {
487 return Ok(Vec::new());
488 }
489 let obj = odb.read(oid)?;
490 Ok(obj.data)
491}