Skip to main content

grit_lib/
patch_ids.rs

1//! Patch-ID computation for commit equivalence detection.
2//!
3//! A patch-ID is a SHA-1 digest of the normalised diff a commit introduces.
4//! Whitespace is stripped from every changed line before hashing, so two
5//! commits whose diffs differ only in whitespace (spaces, tabs, newlines)
6//! produce identical patch-IDs.  This is the semantics required by
7//! `git cherry` and `git format-patch --ignore-if-in-upstream`.
8//!
9//! Two complementary entry points are provided:
10//!
11//! - [`compute_patch_id`] operates on a repository's object database, computing
12//!   the diff from the commit's tree objects.
13//! - [`compute_patch_ids_from_text`] parses unified diff text from stdin (e.g.
14//!   output of `git log -p` or `git diff-tree --patch --stdin`), matching the
15//!   behaviour of `git patch-id`.
16
17use sha1::{Digest, Sha1};
18use similar::{ChangeTag, TextDiff};
19
20use crate::diff::{diff_trees, zero_oid};
21use crate::error::Result;
22use crate::merge_file;
23use crate::objects::{parse_commit, ObjectId, ObjectKind};
24use crate::odb::Odb;
25
26/// How to compute a patch-ID from unified diff text.
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum PatchIdMode {
29    /// Accumulate file diffs into one rolling SHA-1; file order affects the
30    /// result.  Compatible with Git 1.9 and older.  This is the default.
31    Unstable,
32    /// Hash each file's diff independently and accumulate results with a
33    /// carry-addition, so file order does not affect the patch-ID.
34    Stable,
35    /// Like [`PatchIdMode::Stable`] but whitespace is not stripped before
36    /// hashing.
37    Verbatim,
38}
39
40/// Compute patch-IDs from unified diff text (e.g. `git log -p` output).
41///
42/// Parses the text line by line.  Each time a `commit <oid>` or
43/// `From <oid> …` marker is found the accumulated hash for the previous patch
44/// is finalised and emitted as `(patch_id, commit_id)`.  Commits with no diff
45/// content (empty patches, merge commits) are silently skipped.
46///
47/// # Parameters
48///
49/// - `input` — raw bytes of the unified diff stream.
50/// - `mode`  — which patch-ID algorithm to use (see [`PatchIdMode`]).
51///
52/// # Returns
53///
54/// A `Vec` of `(patch_id, commit_id)` pairs in the order they were encountered
55/// in the stream.
56pub fn compute_patch_ids_from_text(input: &[u8], mode: PatchIdMode) -> Vec<(ObjectId, ObjectId)> {
57    let stable = mode != PatchIdMode::Unstable;
58    let verbatim = mode == PatchIdMode::Verbatim;
59
60    let mut results: Vec<(ObjectId, ObjectId)> = Vec::new();
61
62    // Current accumulated state for the patch being processed.
63    let mut ctx = Sha1::new();
64    let mut result = [0u8; 20];
65    let mut patchlen: usize = 0;
66    // before/after: -1 = parsing file header, 0 = awaiting @@ hunk, >0 = in hunk
67    let mut before: i32 = -1;
68    let mut after: i32 = -1;
69    let mut diff_is_binary = false;
70    let mut pre_oid_str = String::new();
71    let mut post_oid_str = String::new();
72    let mut current_commit: Option<ObjectId> = None;
73    // When the input starts directly with a diff (no commit header),
74    // use the zero OID as the commit ID.
75    let mut implicit_commit = true;
76
77    // Iterate lines; we keep the trailing '\n' so remove_space mirrors git's
78    // behaviour (newline counts as whitespace and is stripped).
79    let lines = split_lines_with_nl(input);
80
81    let mut i = 0;
82    while i < lines.len() {
83        let raw = lines[i];
84        i += 1;
85
86        // The line as a &str for prefix checks; non-UTF-8 treated as empty.
87        let line = std::str::from_utf8(raw).unwrap_or("");
88
89        // Try to strip "commit " or "From " prefix to reach a potential OID.
90        let oid_candidate: Option<&str> = if let Some(rest) = line.strip_prefix("commit ") {
91            Some(rest)
92        } else if let Some(rest) = line.strip_prefix("From ") {
93            Some(rest)
94        } else {
95            None
96        };
97
98        if let Some(candidate) = oid_candidate {
99            if let Some(oid) = try_parse_oid_prefix(candidate) {
100                // Finalise the patch we've been accumulating.
101                text_flush_one_hunk(&mut result, &mut ctx);
102                if patchlen > 0 {
103                    if let Some(coid) = current_commit.take() {
104                        if let Ok(pid) = ObjectId::from_bytes(&result) {
105                            results.push((pid, coid));
106                        }
107                    }
108                }
109                // Reset for the new patch.
110                result = [0u8; 20];
111                ctx = Sha1::new();
112                patchlen = 0;
113                before = -1;
114                after = -1;
115                diff_is_binary = false;
116                pre_oid_str.clear();
117                post_oid_str.clear();
118                current_commit = Some(oid);
119                implicit_commit = false;
120                continue;
121            }
122        }
123
124        // "\ No newline at end of file" markers.
125        if line.starts_with("\\ ") && line.len() > 12 {
126            if verbatim {
127                ctx.update(raw);
128                patchlen += raw.len();
129            }
130            continue;
131        }
132
133        // Skip commit metadata before the first diff hunk.
134        if patchlen == 0 && !line.starts_with("diff ") {
135            continue;
136        }
137
138        // If we see a diff line without a preceding commit marker,
139        // treat it as a single patch with zero commit OID.
140        if implicit_commit && line.starts_with("diff ") && current_commit.is_none() {
141            current_commit = Some(ObjectId::from_bytes(&[0u8; 20]).unwrap());
142            implicit_commit = false;
143        }
144
145        // Parsing the per-file header (before == -1).
146        if before == -1 {
147            if line.starts_with("GIT binary patch") || line.starts_with("Binary files") {
148                // Binary: only hash the blob OID strings.
149                diff_is_binary = true;
150                before = 0;
151                let pre = pre_oid_str.clone();
152                let post = post_oid_str.clone();
153                ctx.update(pre.as_bytes());
154                ctx.update(post.as_bytes());
155                patchlen += pre.len() + post.len();
156                if stable {
157                    text_flush_one_hunk(&mut result, &mut ctx);
158                }
159                continue;
160            } else if let Some(rest) = line.strip_prefix("index ") {
161                // index <pre>..<post>[  <mode>]
162                if let Some(dd) = rest.find("..") {
163                    pre_oid_str = rest[..dd].to_owned();
164                    let tail = &rest[dd + 2..];
165                    let end = tail
166                        .find(|c: char| c.is_ascii_whitespace())
167                        .unwrap_or_else(|| {
168                            tail.trim_end_matches('\n').trim_end_matches('\r').len()
169                        });
170                    post_oid_str = tail[..end].to_owned();
171                }
172                continue;
173            } else if line.starts_with("--- ") {
174                before = 1;
175                after = 1;
176                // Fall through to hunk-content processing below.
177            } else if !line.chars().next().is_some_and(|c| c.is_ascii_alphabetic()) {
178                // Non-alpha first char signals end of this patch's diffs.
179                // Re-use the `continue` path; treat as patch boundary.
180                text_flush_one_hunk(&mut result, &mut ctx);
181                if patchlen > 0 {
182                    if let Some(coid) = current_commit.take() {
183                        if let Ok(pid) = ObjectId::from_bytes(&result) {
184                            results.push((pid, coid));
185                        }
186                    }
187                }
188                result = [0u8; 20];
189                ctx = Sha1::new();
190                patchlen = 0;
191                before = -1;
192                after = -1;
193                diff_is_binary = false;
194                continue;
195            }
196        }
197
198        // Skip body of binary diffs; reset on the next `diff ` line.
199        if diff_is_binary {
200            if line.starts_with("diff ") {
201                diff_is_binary = false;
202                before = -1;
203                // Process this `diff ` line as a new file header.
204                i -= 1; // re-process
205            }
206            continue;
207        }
208
209        // Waiting for a hunk header or the next file.
210        if before == 0 && after == 0 {
211            if line.starts_with("@@ -") {
212                let (b, a) = scan_hunk_header(line);
213                before = b;
214                after = a;
215                continue;
216            }
217            if !line.starts_with("diff ") {
218                // End of this file's content; nothing special to do—
219                // if it's a commit boundary the loop head will handle it.
220                continue;
221            }
222            // Another file diff starts; flush per-file hash in stable mode.
223            if stable {
224                text_flush_one_hunk(&mut result, &mut ctx);
225            }
226            before = -1;
227            after = -1;
228            // Re-process this `diff ` line as a new file header.
229            i -= 1;
230            continue;
231        }
232
233        // Inside a hunk — update the line counters.
234        let first = raw.first().copied().unwrap_or(b' ');
235        if first == b'-' || first == b' ' {
236            before -= 1;
237        }
238        if first == b'+' || first == b' ' {
239            after -= 1;
240        }
241
242        // Hash the line, optionally stripping whitespace.
243        let hashed = if verbatim {
244            ctx.update(raw);
245            raw.len()
246        } else {
247            hash_without_whitespace(&mut ctx, raw)
248        };
249        patchlen += hashed;
250    }
251
252    // Flush the final patch.
253    text_flush_one_hunk(&mut result, &mut ctx);
254    if patchlen > 0 {
255        if let Some(coid) = current_commit {
256            if let Ok(pid) = ObjectId::from_bytes(&result) {
257                results.push((pid, coid));
258            }
259        }
260    }
261
262    results
263}
264
265/// Finalise `ctx`, accumulate its digest into `result` with byte-wise
266/// carry-addition (mirrors git's `flush_one_hunk`), and reset `ctx`.
267fn text_flush_one_hunk(result: &mut [u8; 20], ctx: &mut Sha1) {
268    let old = std::mem::replace(ctx, Sha1::new());
269    let hash: [u8; 20] = old.finalize().into();
270    let mut carry: u16 = 0;
271    for i in 0..20 {
272        carry = carry + result[i] as u16 + hash[i] as u16;
273        result[i] = carry as u8;
274        carry >>= 8;
275    }
276}
277
278/// Hash `raw` bytes into `ctx`, skipping ASCII whitespace.
279///
280/// Returns the number of non-whitespace bytes fed to the hasher.
281fn hash_without_whitespace(ctx: &mut Sha1, raw: &[u8]) -> usize {
282    let mut count = 0;
283    for &b in raw {
284        if !b.is_ascii_whitespace() {
285            ctx.update([b]);
286            count += 1;
287        }
288    }
289    count
290}
291
292/// Parse a 40-hex OID at the start of `s`.
293///
294/// Returns `None` if `s` does not start with exactly 40 lowercase hex digits
295/// optionally followed by ASCII whitespace.
296fn try_parse_oid_prefix(s: &str) -> Option<ObjectId> {
297    let s = s.trim_end_matches('\n').trim_end_matches('\r');
298    if s.len() < 40 {
299        return None;
300    }
301    let hex = &s[..40];
302    if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
303        return None;
304    }
305    // The character after the OID (if any) must be whitespace or end of string.
306    if s.len() > 40 && !s.as_bytes()[40].is_ascii_whitespace() {
307        return None;
308    }
309    let mut bytes = [0u8; 20];
310    for (i, chunk) in hex.as_bytes().chunks(2).enumerate() {
311        let hi = hex_val(chunk[0])?;
312        let lo = hex_val(chunk[1])?;
313        bytes[i] = (hi << 4) | lo;
314    }
315    ObjectId::from_bytes(&bytes).ok()
316}
317
318/// Convert a single ASCII hex digit to its value.
319fn hex_val(b: u8) -> Option<u8> {
320    match b {
321        b'0'..=b'9' => Some(b - b'0'),
322        b'a'..=b'f' => Some(b - b'a' + 10),
323        b'A'..=b'F' => Some(b - b'A' + 10),
324        _ => None,
325    }
326}
327
328/// Parse the before/after line counts from a `@@ -<old>[,<n>] +<new>[,<m>] @@` header.
329///
330/// Returns `(before, after)` counts, defaulting to 1 when the count is absent.
331fn scan_hunk_header(line: &str) -> (i32, i32) {
332    // line starts with "@@ -"
333    let rest = match line.strip_prefix("@@ -") {
334        Some(r) => r,
335        None => return (1, 1),
336    };
337    // Parse old count: skip start line number, grab optional comma + count.
338    let before = parse_hunk_count(rest);
339    // Find " +" separator.
340    let after = rest
341        .find(" +")
342        .and_then(|p| parse_hunk_count_opt(&rest[p + 2..]))
343        .unwrap_or(1);
344    (before, after)
345}
346
347/// Parse `<start>[,<count>]` and return `count` (or 1 if absent).
348fn parse_hunk_count(s: &str) -> i32 {
349    // Skip digits of the start line number.
350    let after_start = s.trim_start_matches(|c: char| c.is_ascii_digit());
351    if let Some(rest) = after_start.strip_prefix(',') {
352        rest.split(|c: char| !c.is_ascii_digit())
353            .next()
354            .and_then(|n| n.parse().ok())
355            .unwrap_or(1)
356    } else {
357        1
358    }
359}
360
361/// Same as [`parse_hunk_count`] but returns `Option`.
362fn parse_hunk_count_opt(s: &str) -> Option<i32> {
363    Some(parse_hunk_count(s))
364}
365
366/// Split `input` into lines, preserving the trailing `\n` on each line.
367///
368/// The final slice may lack a trailing newline if the input doesn't end with
369/// one.
370fn split_lines_with_nl(input: &[u8]) -> Vec<&[u8]> {
371    let mut lines = Vec::new();
372    let mut start = 0;
373    for (i, &b) in input.iter().enumerate() {
374        if b == b'\n' {
375            lines.push(&input[start..=i]);
376            start = i + 1;
377        }
378    }
379    if start < input.len() {
380        lines.push(&input[start..]);
381    }
382    lines
383}
384
385/// Compute the patch-ID for a single commit.
386///
387/// Returns `None` for merge commits (more than one parent), since those do not
388/// have a well-defined single-parent diff.  Root commits (no parents) are
389/// compared against the empty tree.
390///
391/// # Parameters
392///
393/// - `odb` — object database used to read commit, tree, and blob objects.
394/// - `commit_oid` — OID of the commit to compute the patch-ID for.
395///
396/// # Errors
397///
398/// Returns errors from object-database reads or object-parse failures.
399pub fn compute_patch_id(odb: &Odb, commit_oid: &ObjectId) -> Result<Option<ObjectId>> {
400    let obj = odb.read(commit_oid)?;
401    if obj.kind != ObjectKind::Commit {
402        return Ok(None);
403    }
404    let commit = parse_commit(&obj.data)?;
405
406    // Merge commits (>1 parent) have no defined patch-id.
407    if commit.parents.len() > 1 {
408        return Ok(None);
409    }
410
411    // Resolve the parent tree (None = empty tree for root commits).
412    let parent_tree_oid = if commit.parents.is_empty() {
413        None
414    } else {
415        let parent_obj = odb.read(&commit.parents[0])?;
416        let parent_commit = parse_commit(&parent_obj.data)?;
417        Some(parent_commit.tree)
418    };
419
420    // Compute tree-to-tree diff.
421    let mut diffs = diff_trees(odb, parent_tree_oid.as_ref(), Some(&commit.tree), "")?;
422
423    // Sort by primary path (lexicographic), matching diffcore_std ordering.
424    diffs.sort_by(|a, b| a.path().cmp(b.path()));
425
426    let mut result = [0u8; 20];
427
428    for entry in &diffs {
429        let old_path = entry.old_path.as_deref().unwrap_or("");
430        let new_path = entry.new_path.as_deref().unwrap_or("");
431        let mut old_path_buf = old_path.as_bytes().to_vec();
432        let mut new_path_buf = new_path.as_bytes().to_vec();
433        let len1 = remove_space_bytes(&mut old_path_buf);
434        let len2 = remove_space_bytes(&mut new_path_buf);
435
436        let old_mode = parse_mode_u32(&entry.old_mode);
437        let new_mode = parse_mode_u32(&entry.new_mode);
438
439        let mut ctx = Sha1::new();
440        patch_id_add_string(&mut ctx, b"diff--git");
441        patch_id_add_string(&mut ctx, b"a/");
442        ctx.update(&old_path_buf[..len1]);
443        patch_id_add_string(&mut ctx, b"b/");
444        ctx.update(&new_path_buf[..len2]);
445
446        if old_mode == 0 {
447            patch_id_add_string(&mut ctx, b"newfilemode");
448            patch_id_add_mode(&mut ctx, new_mode);
449        } else if new_mode == 0 {
450            patch_id_add_string(&mut ctx, b"deletedfilemode");
451            patch_id_add_mode(&mut ctx, old_mode);
452        } else if old_mode != new_mode {
453            patch_id_add_string(&mut ctx, b"oldmode");
454            patch_id_add_mode(&mut ctx, old_mode);
455            patch_id_add_string(&mut ctx, b"newmode");
456            patch_id_add_mode(&mut ctx, new_mode);
457        }
458
459        let old_bytes = read_blob(odb, &entry.old_oid)?;
460        let new_bytes = read_blob(odb, &entry.new_oid)?;
461
462        if merge_file::is_binary(&old_bytes) || merge_file::is_binary(&new_bytes) {
463            let a = entry.old_oid.to_hex();
464            let b = entry.new_oid.to_hex();
465            ctx.update(a.as_bytes());
466            ctx.update(b.as_bytes());
467        } else {
468            let old_str = std::str::from_utf8(&old_bytes).unwrap_or("");
469            let new_str = std::str::from_utf8(&new_bytes).unwrap_or("");
470
471            if old_mode == 0 {
472                patch_id_add_string(&mut ctx, b"---/dev/null");
473                patch_id_add_string(&mut ctx, b"+++b/");
474                ctx.update(&new_path_buf[..len2]);
475            } else if new_mode == 0 {
476                patch_id_add_string(&mut ctx, b"---a/");
477                ctx.update(&old_path_buf[..len1]);
478                patch_id_add_string(&mut ctx, b"+++/dev/null");
479            } else {
480                patch_id_add_string(&mut ctx, b"---a/");
481                ctx.update(&old_path_buf[..len1]);
482                patch_id_add_string(&mut ctx, b"+++b/");
483                ctx.update(&new_path_buf[..len2]);
484            }
485
486            let diff = TextDiff::from_lines(old_str, new_str);
487            for change in diff.iter_all_changes() {
488                let prefix = match change.tag() {
489                    ChangeTag::Equal => b' ',
490                    ChangeTag::Delete => b'-',
491                    ChangeTag::Insert => b'+',
492                };
493                let text = change.as_str().unwrap_or("");
494                for piece in text.split_inclusive('\n') {
495                    let line_body = piece.strip_suffix('\n').unwrap_or(piece);
496                    let mut line_buf = Vec::with_capacity(1 + line_body.len() + 1);
497                    line_buf.push(prefix);
498                    line_buf.extend_from_slice(line_body.as_bytes());
499                    line_buf.push(b'\n');
500                    let n = remove_space_bytes(&mut line_buf);
501                    ctx.update(&line_buf[..n]);
502                }
503            }
504        }
505
506        text_flush_one_hunk(&mut result, &mut ctx);
507    }
508
509    ObjectId::from_bytes(&result).map(Some)
510}
511
512fn parse_mode_u32(mode: &str) -> u32 {
513    u32::from_str_radix(mode.trim(), 8).unwrap_or(0)
514}
515
516fn patch_id_add_string(ctx: &mut Sha1, s: &[u8]) {
517    ctx.update(s);
518}
519
520fn patch_id_add_mode(ctx: &mut Sha1, mode: u32) {
521    let text = format!("{mode:06o}");
522    ctx.update(text.as_bytes());
523}
524
525/// Strip ASCII whitespace in-place; returns new length (prefix of `buf` is valid).
526fn remove_space_bytes(buf: &mut Vec<u8>) -> usize {
527    let mut dst = 0usize;
528    for i in 0..buf.len() {
529        let c = buf[i];
530        if !c.is_ascii_whitespace() {
531            buf[dst] = c;
532            dst += 1;
533        }
534    }
535    dst
536}
537
538/// Read a blob's raw bytes from the ODB.
539///
540/// Returns an empty `Vec` for the zero OID (representing an absent file).
541fn read_blob(odb: &Odb, oid: &ObjectId) -> Result<Vec<u8>> {
542    if *oid == zero_oid() {
543        return Ok(Vec::new());
544    }
545    let obj = odb.read(oid)?;
546    Ok(obj.data)
547}