Skip to main content

grit_lib/
patch_ids.rs

1//! Patch-ID computation for commit equivalence detection.
2//!
3//! A patch-ID is a SHA-1 digest of the normalised diff a commit introduces.
4//! Whitespace is stripped from every changed line before hashing, so two
5//! commits whose diffs differ only in whitespace (spaces, tabs, newlines)
6//! produce identical patch-IDs.  This is the semantics required by
7//! `git cherry` and `git format-patch --ignore-if-in-upstream`.
8//!
9//! Two complementary entry points are provided:
10//!
11//! - [`compute_patch_id`] operates on a repository's object database, computing
12//!   the diff from the commit's tree objects.
13//! - [`compute_patch_ids_from_text`] parses unified diff text from stdin (e.g.
14//!   output of `git log -p` or `git diff-tree --patch --stdin`), matching the
15//!   behaviour of `git patch-id`.
16
17use sha1::{Digest, Sha1};
18use similar::{ChangeTag, TextDiff};
19
20use crate::diff::{diff_trees, zero_oid};
21use crate::error::Result;
22use crate::objects::{parse_commit, ObjectId, ObjectKind};
23use crate::odb::Odb;
24
25/// How to compute a patch-ID from unified diff text.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub enum PatchIdMode {
28    /// Accumulate file diffs into one rolling SHA-1; file order affects the
29    /// result.  Compatible with Git 1.9 and older.  This is the default.
30    Unstable,
31    /// Hash each file's diff independently and accumulate results with a
32    /// carry-addition, so file order does not affect the patch-ID.
33    Stable,
34    /// Like [`PatchIdMode::Stable`] but whitespace is not stripped before
35    /// hashing.
36    Verbatim,
37}
38
39/// Compute patch-IDs from unified diff text (e.g. `git log -p` output).
40///
41/// Parses the text line by line.  Each time a `commit <oid>` or
42/// `From <oid> …` marker is found the accumulated hash for the previous patch
43/// is finalised and emitted as `(patch_id, commit_id)`.  Commits with no diff
44/// content (empty patches, merge commits) are silently skipped.
45///
46/// # Parameters
47///
48/// - `input` — raw bytes of the unified diff stream.
49/// - `mode`  — which patch-ID algorithm to use (see [`PatchIdMode`]).
50///
51/// # Returns
52///
53/// A `Vec` of `(patch_id, commit_id)` pairs in the order they were encountered
54/// in the stream.
55pub fn compute_patch_ids_from_text(input: &[u8], mode: PatchIdMode) -> Vec<(ObjectId, ObjectId)> {
56    let stable = mode != PatchIdMode::Unstable;
57    let verbatim = mode == PatchIdMode::Verbatim;
58
59    let mut results: Vec<(ObjectId, ObjectId)> = Vec::new();
60
61    // Current accumulated state for the patch being processed.
62    let mut ctx = Sha1::new();
63    let mut result = [0u8; 20];
64    let mut patchlen: usize = 0;
65    // before/after: -1 = parsing file header, 0 = awaiting @@ hunk, >0 = in hunk
66    let mut before: i32 = -1;
67    let mut after: i32 = -1;
68    let mut diff_is_binary = false;
69    let mut pre_oid_str = String::new();
70    let mut post_oid_str = String::new();
71    let mut current_commit: Option<ObjectId> = None;
72    // When the input starts directly with a diff (no commit header),
73    // use the zero OID as the commit ID.
74    let mut implicit_commit = true;
75
76    // Iterate lines; we keep the trailing '\n' so remove_space mirrors git's
77    // behaviour (newline counts as whitespace and is stripped).
78    let lines = split_lines_with_nl(input);
79
80    let mut i = 0;
81    while i < lines.len() {
82        let raw = lines[i];
83        i += 1;
84
85        // The line as a &str for prefix checks; non-UTF-8 treated as empty.
86        let line = std::str::from_utf8(raw).unwrap_or("");
87
88        // Try to strip "commit " or "From " prefix to reach a potential OID.
89        let oid_candidate: Option<&str> = if let Some(rest) = line.strip_prefix("commit ") {
90            Some(rest)
91        } else if let Some(rest) = line.strip_prefix("From ") {
92            Some(rest)
93        } else {
94            None
95        };
96
97        if let Some(candidate) = oid_candidate {
98            if let Some(oid) = try_parse_oid_prefix(candidate) {
99                // Finalise the patch we've been accumulating.
100                text_flush_one_hunk(&mut result, &mut ctx);
101                if patchlen > 0 {
102                    if let Some(coid) = current_commit.take() {
103                        if let Ok(pid) = ObjectId::from_bytes(&result) {
104                            results.push((pid, coid));
105                        }
106                    }
107                }
108                // Reset for the new patch.
109                result = [0u8; 20];
110                ctx = Sha1::new();
111                patchlen = 0;
112                before = -1;
113                after = -1;
114                diff_is_binary = false;
115                pre_oid_str.clear();
116                post_oid_str.clear();
117                current_commit = Some(oid);
118                implicit_commit = false;
119                continue;
120            }
121        }
122
123        // "\ No newline at end of file" markers.
124        if line.starts_with("\\ ") && line.len() > 12 {
125            if verbatim {
126                ctx.update(raw);
127                patchlen += raw.len();
128            }
129            continue;
130        }
131
132        // Skip commit metadata before the first diff hunk.
133        if patchlen == 0 && !line.starts_with("diff ") {
134            continue;
135        }
136
137        // If we see a diff line without a preceding commit marker,
138        // treat it as a single patch with zero commit OID.
139        if implicit_commit && line.starts_with("diff ") && current_commit.is_none() {
140            current_commit = Some(ObjectId::from_bytes(&[0u8; 20]).unwrap());
141            implicit_commit = false;
142        }
143
144        // Parsing the per-file header (before == -1).
145        if before == -1 {
146            if line.starts_with("GIT binary patch") || line.starts_with("Binary files") {
147                // Binary: only hash the blob OID strings.
148                diff_is_binary = true;
149                before = 0;
150                let pre = pre_oid_str.clone();
151                let post = post_oid_str.clone();
152                ctx.update(pre.as_bytes());
153                ctx.update(post.as_bytes());
154                patchlen += pre.len() + post.len();
155                if stable {
156                    text_flush_one_hunk(&mut result, &mut ctx);
157                }
158                continue;
159            } else if let Some(rest) = line.strip_prefix("index ") {
160                // index <pre>..<post>[  <mode>]
161                if let Some(dd) = rest.find("..") {
162                    pre_oid_str = rest[..dd].to_owned();
163                    let tail = &rest[dd + 2..];
164                    let end = tail
165                        .find(|c: char| c.is_ascii_whitespace())
166                        .unwrap_or_else(|| {
167                            tail.trim_end_matches('\n').trim_end_matches('\r').len()
168                        });
169                    post_oid_str = tail[..end].to_owned();
170                }
171                continue;
172            } else if line.starts_with("--- ") {
173                before = 1;
174                after = 1;
175                // Fall through to hunk-content processing below.
176            } else if !line.chars().next().is_some_and(|c| c.is_ascii_alphabetic()) {
177                // Non-alpha first char signals end of this patch's diffs.
178                // Re-use the `continue` path; treat as patch boundary.
179                text_flush_one_hunk(&mut result, &mut ctx);
180                if patchlen > 0 {
181                    if let Some(coid) = current_commit.take() {
182                        if let Ok(pid) = ObjectId::from_bytes(&result) {
183                            results.push((pid, coid));
184                        }
185                    }
186                }
187                result = [0u8; 20];
188                ctx = Sha1::new();
189                patchlen = 0;
190                before = -1;
191                after = -1;
192                diff_is_binary = false;
193                continue;
194            }
195        }
196
197        // Skip body of binary diffs; reset on the next `diff ` line.
198        if diff_is_binary {
199            if line.starts_with("diff ") {
200                diff_is_binary = false;
201                before = -1;
202                // Process this `diff ` line as a new file header.
203                i -= 1; // re-process
204            }
205            continue;
206        }
207
208        // Waiting for a hunk header or the next file.
209        if before == 0 && after == 0 {
210            if line.starts_with("@@ -") {
211                let (b, a) = scan_hunk_header(line);
212                before = b;
213                after = a;
214                continue;
215            }
216            if !line.starts_with("diff ") {
217                // End of this file's content; nothing special to do—
218                // if it's a commit boundary the loop head will handle it.
219                continue;
220            }
221            // Another file diff starts; flush per-file hash in stable mode.
222            if stable {
223                text_flush_one_hunk(&mut result, &mut ctx);
224            }
225            before = -1;
226            after = -1;
227            // Re-process this `diff ` line as a new file header.
228            i -= 1;
229            continue;
230        }
231
232        // Inside a hunk — update the line counters.
233        let first = raw.first().copied().unwrap_or(b' ');
234        if first == b'-' || first == b' ' {
235            before -= 1;
236        }
237        if first == b'+' || first == b' ' {
238            after -= 1;
239        }
240
241        // Hash the line, optionally stripping whitespace.
242        let hashed = if verbatim {
243            ctx.update(raw);
244            raw.len()
245        } else {
246            hash_without_whitespace(&mut ctx, raw)
247        };
248        patchlen += hashed;
249    }
250
251    // Flush the final patch.
252    text_flush_one_hunk(&mut result, &mut ctx);
253    if patchlen > 0 {
254        if let Some(coid) = current_commit {
255            if let Ok(pid) = ObjectId::from_bytes(&result) {
256                results.push((pid, coid));
257            }
258        }
259    }
260
261    results
262}
263
264/// Finalise `ctx`, accumulate its digest into `result` with byte-wise
265/// carry-addition (mirrors git's `flush_one_hunk`), and reset `ctx`.
266fn text_flush_one_hunk(result: &mut [u8; 20], ctx: &mut Sha1) {
267    let old = std::mem::replace(ctx, Sha1::new());
268    let hash: [u8; 20] = old.finalize().into();
269    let mut carry: u16 = 0;
270    for i in 0..20 {
271        carry = carry + result[i] as u16 + hash[i] as u16;
272        result[i] = carry as u8;
273        carry >>= 8;
274    }
275}
276
277/// Hash `raw` bytes into `ctx`, skipping ASCII whitespace.
278///
279/// Returns the number of non-whitespace bytes fed to the hasher.
280fn hash_without_whitespace(ctx: &mut Sha1, raw: &[u8]) -> usize {
281    let mut count = 0;
282    for &b in raw {
283        if !b.is_ascii_whitespace() {
284            ctx.update([b]);
285            count += 1;
286        }
287    }
288    count
289}
290
291/// Parse a 40-hex OID at the start of `s`.
292///
293/// Returns `None` if `s` does not start with exactly 40 lowercase hex digits
294/// optionally followed by ASCII whitespace.
295fn try_parse_oid_prefix(s: &str) -> Option<ObjectId> {
296    let s = s.trim_end_matches('\n').trim_end_matches('\r');
297    if s.len() < 40 {
298        return None;
299    }
300    let hex = &s[..40];
301    if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
302        return None;
303    }
304    // The character after the OID (if any) must be whitespace or end of string.
305    if s.len() > 40 && !s.as_bytes()[40].is_ascii_whitespace() {
306        return None;
307    }
308    let mut bytes = [0u8; 20];
309    for (i, chunk) in hex.as_bytes().chunks(2).enumerate() {
310        let hi = hex_val(chunk[0])?;
311        let lo = hex_val(chunk[1])?;
312        bytes[i] = (hi << 4) | lo;
313    }
314    ObjectId::from_bytes(&bytes).ok()
315}
316
317/// Convert a single ASCII hex digit to its value.
318fn hex_val(b: u8) -> Option<u8> {
319    match b {
320        b'0'..=b'9' => Some(b - b'0'),
321        b'a'..=b'f' => Some(b - b'a' + 10),
322        b'A'..=b'F' => Some(b - b'A' + 10),
323        _ => None,
324    }
325}
326
327/// Parse the before/after line counts from a `@@ -<old>[,<n>] +<new>[,<m>] @@` header.
328///
329/// Returns `(before, after)` counts, defaulting to 1 when the count is absent.
330fn scan_hunk_header(line: &str) -> (i32, i32) {
331    // line starts with "@@ -"
332    let rest = match line.strip_prefix("@@ -") {
333        Some(r) => r,
334        None => return (1, 1),
335    };
336    // Parse old count: skip start line number, grab optional comma + count.
337    let before = parse_hunk_count(rest);
338    // Find " +" separator.
339    let after = rest
340        .find(" +")
341        .and_then(|p| parse_hunk_count_opt(&rest[p + 2..]))
342        .unwrap_or(1);
343    (before, after)
344}
345
346/// Parse `<start>[,<count>]` and return `count` (or 1 if absent).
347fn parse_hunk_count(s: &str) -> i32 {
348    // Skip digits of the start line number.
349    let after_start = s.trim_start_matches(|c: char| c.is_ascii_digit());
350    if let Some(rest) = after_start.strip_prefix(',') {
351        rest.split(|c: char| !c.is_ascii_digit())
352            .next()
353            .and_then(|n| n.parse().ok())
354            .unwrap_or(1)
355    } else {
356        1
357    }
358}
359
360/// Same as [`parse_hunk_count`] but returns `Option`.
361fn parse_hunk_count_opt(s: &str) -> Option<i32> {
362    Some(parse_hunk_count(s))
363}
364
365/// Split `input` into lines, preserving the trailing `\n` on each line.
366///
367/// The final slice may lack a trailing newline if the input doesn't end with
368/// one.
369fn split_lines_with_nl(input: &[u8]) -> Vec<&[u8]> {
370    let mut lines = Vec::new();
371    let mut start = 0;
372    for (i, &b) in input.iter().enumerate() {
373        if b == b'\n' {
374            lines.push(&input[start..=i]);
375            start = i + 1;
376        }
377    }
378    if start < input.len() {
379        lines.push(&input[start..]);
380    }
381    lines
382}
383
384/// Compute the patch-ID for a single commit.
385///
386/// Returns `None` for merge commits (more than one parent), since those do not
387/// have a well-defined single-parent diff.  Root commits (no parents) are
388/// compared against the empty tree.
389///
390/// # Parameters
391///
392/// - `odb` — object database used to read commit, tree, and blob objects.
393/// - `commit_oid` — OID of the commit to compute the patch-ID for.
394///
395/// # Errors
396///
397/// Returns errors from object-database reads or object-parse failures.
398pub fn compute_patch_id(odb: &Odb, commit_oid: &ObjectId) -> Result<Option<ObjectId>> {
399    let obj = odb.read(commit_oid)?;
400    if obj.kind != ObjectKind::Commit {
401        return Ok(None);
402    }
403    let commit = parse_commit(&obj.data)?;
404
405    // Merge commits (>1 parent) have no defined patch-id.
406    if commit.parents.len() > 1 {
407        return Ok(None);
408    }
409
410    // Resolve the parent tree (None = empty tree for root commits).
411    let parent_tree_oid = if commit.parents.is_empty() {
412        None
413    } else {
414        let parent_obj = odb.read(&commit.parents[0])?;
415        let parent_commit = parse_commit(&parent_obj.data)?;
416        Some(parent_commit.tree)
417    };
418
419    // Compute tree-to-tree diff.
420    let mut diffs = diff_trees(odb, parent_tree_oid.as_ref(), Some(&commit.tree), "")?;
421
422    // Sort by primary path (lexicographic), matching diffcore_std ordering.
423    diffs.sort_by(|a, b| a.path().cmp(b.path()));
424
425    let mut hasher = Sha1::new();
426
427    for entry in &diffs {
428        // Determine src and dst path strings (both sides always carry the
429        // logical path, even for pure adds/deletes).
430        let src = entry
431            .old_path
432            .as_deref()
433            .or(entry.new_path.as_deref())
434            .unwrap_or("");
435        let dst = entry
436            .new_path
437            .as_deref()
438            .or(entry.old_path.as_deref())
439            .unwrap_or("");
440
441        // Compact paths: remove all ASCII-whitespace (mirrors git's remove_space).
442        let src_compact = compact_path(src);
443        let dst_compact = compact_path(dst);
444
445        // Hash the diff header line.
446        let header = format!("diff --git a/{src_compact} b/{dst_compact}\n");
447        hasher.update(header.as_bytes());
448
449        // Read blob content for the old and new sides.
450        let old_bytes = read_blob(odb, &entry.old_oid)?;
451        let new_bytes = read_blob(odb, &entry.new_oid)?;
452
453        // Convert to str slices for the line-diff algorithm; treat non-UTF-8
454        // content as empty (binary blobs only contribute their header).
455        let old_str = std::str::from_utf8(&old_bytes).unwrap_or("");
456        let new_str = std::str::from_utf8(&new_bytes).unwrap_or("");
457
458        // Hash non-whitespace bytes from every added and deleted line.
459        let diff = TextDiff::from_lines(old_str, new_str);
460        for change in diff.iter_all_changes() {
461            match change.tag() {
462                ChangeTag::Delete | ChangeTag::Insert => {
463                    let line = change.as_str().unwrap_or("");
464                    for &byte in line.as_bytes() {
465                        if !byte.is_ascii_whitespace() {
466                            hasher.update([byte]);
467                        }
468                    }
469                }
470                ChangeTag::Equal => {}
471            }
472        }
473    }
474
475    let digest = hasher.finalize();
476    ObjectId::from_bytes(&digest).map(Some)
477}
478
479/// Remove all ASCII-whitespace characters from a path string.
480///
481/// Mirrors git's `remove_space()` used when hashing diff headers.
482fn compact_path(path: &str) -> String {
483    path.bytes()
484        .filter(|b| !b.is_ascii_whitespace())
485        .map(|b| b as char)
486        .collect()
487}
488
489/// Read a blob's raw bytes from the ODB.
490///
491/// Returns an empty `Vec` for the zero OID (representing an absent file).
492fn read_blob(odb: &Odb, oid: &ObjectId) -> Result<Vec<u8>> {
493    if *oid == zero_oid() {
494        return Ok(Vec::new());
495    }
496    let obj = odb.read(oid)?;
497    Ok(obj.data)
498}