Skip to main content

grit_lib/
patch_ids.rs

1//! Patch-ID computation for commit equivalence detection.
2//!
3//! A patch-ID is a SHA-1 digest of the normalised diff a commit introduces.
4//! Whitespace is stripped from every changed line before hashing, so two
5//! commits whose diffs differ only in whitespace (spaces, tabs, newlines)
6//! produce identical patch-IDs.  This is the semantics required by
7//! `git cherry` and `git format-patch --ignore-if-in-upstream`.
8//!
9//! Two complementary entry points are provided:
10//!
11//! - [`compute_patch_id`] operates on a repository's object database, computing
12//!   the diff from the commit's tree objects.
13//! - [`compute_patch_ids_from_text`] parses unified diff text from stdin (e.g.
14//!   output of `git log -p` or `git diff-tree --patch --stdin`), matching the
15//!   behaviour of `git patch-id`.
16
17use sha1::{Digest, Sha1};
18use similar::{ChangeTag, TextDiff};
19
20use crate::diff::{diff_trees, zero_oid};
21use crate::error::Result;
22use crate::objects::{parse_commit, ObjectId, ObjectKind};
23use crate::odb::Odb;
24
25/// How to compute a patch-ID from unified diff text.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub enum PatchIdMode {
28    /// Accumulate file diffs into one rolling SHA-1; file order affects the
29    /// result.  Compatible with Git 1.9 and older.  This is the default.
30    Unstable,
31    /// Hash each file's diff independently and accumulate results with a
32    /// carry-addition, so file order does not affect the patch-ID.
33    Stable,
34    /// Like [`PatchIdMode::Stable`] but whitespace is not stripped before
35    /// hashing.
36    Verbatim,
37}
38
39/// Compute patch-IDs from unified diff text (e.g. `git log -p` output).
40///
41/// Parses the text line by line.  Each time a `commit <oid>` or
42/// `From <oid> …` marker is found the accumulated hash for the previous patch
43/// is finalised and emitted as `(patch_id, commit_id)`.  Commits with no diff
44/// content (empty patches, merge commits) are silently skipped.
45///
46/// # Parameters
47///
48/// - `input` — raw bytes of the unified diff stream.
49/// - `mode`  — which patch-ID algorithm to use (see [`PatchIdMode`]).
50///
51/// # Returns
52///
53/// A `Vec` of `(patch_id, commit_id)` pairs in the order they were encountered
54/// in the stream.
55pub fn compute_patch_ids_from_text(input: &[u8], mode: PatchIdMode) -> Vec<(ObjectId, ObjectId)> {
56    let stable = mode != PatchIdMode::Unstable;
57    let verbatim = mode == PatchIdMode::Verbatim;
58
59    let mut results: Vec<(ObjectId, ObjectId)> = Vec::new();
60
61    // Current accumulated state for the patch being processed.
62    let mut ctx = Sha1::new();
63    let mut result = [0u8; 20];
64    let mut patchlen: usize = 0;
65    // before/after: -1 = parsing file header, 0 = awaiting @@ hunk, >0 = in hunk
66    let mut before: i32 = -1;
67    let mut after: i32 = -1;
68    let mut diff_is_binary = false;
69    let mut pre_oid_str = String::new();
70    let mut post_oid_str = String::new();
71    let mut current_commit: Option<ObjectId> = None;
72
73    // Iterate lines; we keep the trailing '\n' so remove_space mirrors git's
74    // behaviour (newline counts as whitespace and is stripped).
75    let lines = split_lines_with_nl(input);
76
77    let mut i = 0;
78    while i < lines.len() {
79        let raw = lines[i];
80        i += 1;
81
82        // The line as a &str for prefix checks; non-UTF-8 treated as empty.
83        let line = std::str::from_utf8(raw).unwrap_or("");
84
85        // Try to strip "commit " or "From " prefix to reach a potential OID.
86        let oid_candidate: Option<&str> = if let Some(rest) = line.strip_prefix("commit ") {
87            Some(rest)
88        } else if let Some(rest) = line.strip_prefix("From ") {
89            Some(rest)
90        } else {
91            None
92        };
93
94        if let Some(candidate) = oid_candidate {
95            if let Some(oid) = try_parse_oid_prefix(candidate) {
96                // Finalise the patch we've been accumulating.
97                text_flush_one_hunk(&mut result, &mut ctx);
98                if patchlen > 0 {
99                    if let Some(coid) = current_commit.take() {
100                        if let Ok(pid) = ObjectId::from_bytes(&result) {
101                            results.push((pid, coid));
102                        }
103                    }
104                }
105                // Reset for the new patch.
106                result = [0u8; 20];
107                ctx = Sha1::new();
108                patchlen = 0;
109                before = -1;
110                after = -1;
111                diff_is_binary = false;
112                pre_oid_str.clear();
113                post_oid_str.clear();
114                current_commit = Some(oid);
115                continue;
116            }
117        }
118
119        // "\ No newline at end of file" markers.
120        if line.starts_with("\\ ") && line.len() > 12 {
121            if verbatim {
122                ctx.update(raw);
123                patchlen += raw.len();
124            }
125            continue;
126        }
127
128        // Skip commit metadata before the first diff hunk.
129        if patchlen == 0 && !line.starts_with("diff ") {
130            continue;
131        }
132
133        // Parsing the per-file header (before == -1).
134        if before == -1 {
135            if line.starts_with("GIT binary patch") || line.starts_with("Binary files") {
136                // Binary: only hash the blob OID strings.
137                diff_is_binary = true;
138                before = 0;
139                let pre = pre_oid_str.clone();
140                let post = post_oid_str.clone();
141                ctx.update(pre.as_bytes());
142                ctx.update(post.as_bytes());
143                patchlen += pre.len() + post.len();
144                if stable {
145                    text_flush_one_hunk(&mut result, &mut ctx);
146                }
147                continue;
148            } else if let Some(rest) = line.strip_prefix("index ") {
149                // index <pre>..<post>[  <mode>]
150                if let Some(dd) = rest.find("..") {
151                    pre_oid_str = rest[..dd].to_owned();
152                    let tail = &rest[dd + 2..];
153                    let end = tail
154                        .find(|c: char| c.is_ascii_whitespace())
155                        .unwrap_or_else(|| {
156                            tail.trim_end_matches('\n').trim_end_matches('\r').len()
157                        });
158                    post_oid_str = tail[..end].to_owned();
159                }
160                continue;
161            } else if line.starts_with("--- ") {
162                before = 1;
163                after = 1;
164                // Fall through to hunk-content processing below.
165            } else if !line
166                .chars()
167                .next()
168                .is_some_and(|c| c.is_ascii_alphabetic())
169            {
170                // Non-alpha first char signals end of this patch's diffs.
171                // Re-use the `continue` path; treat as patch boundary.
172                text_flush_one_hunk(&mut result, &mut ctx);
173                if patchlen > 0 {
174                    if let Some(coid) = current_commit.take() {
175                        if let Ok(pid) = ObjectId::from_bytes(&result) {
176                            results.push((pid, coid));
177                        }
178                    }
179                }
180                result = [0u8; 20];
181                ctx = Sha1::new();
182                patchlen = 0;
183                before = -1;
184                after = -1;
185                diff_is_binary = false;
186                continue;
187            }
188        }
189
190        // Skip body of binary diffs; reset on the next `diff ` line.
191        if diff_is_binary {
192            if line.starts_with("diff ") {
193                diff_is_binary = false;
194                before = -1;
195                // Process this `diff ` line as a new file header.
196                i -= 1; // re-process
197            }
198            continue;
199        }
200
201        // Waiting for a hunk header or the next file.
202        if before == 0 && after == 0 {
203            if line.starts_with("@@ -") {
204                let (b, a) = scan_hunk_header(line);
205                before = b;
206                after = a;
207                continue;
208            }
209            if !line.starts_with("diff ") {
210                // End of this file's content; nothing special to do—
211                // if it's a commit boundary the loop head will handle it.
212                continue;
213            }
214            // Another file diff starts; flush per-file hash in stable mode.
215            if stable {
216                text_flush_one_hunk(&mut result, &mut ctx);
217            }
218            before = -1;
219            after = -1;
220            // Re-process this `diff ` line as a new file header.
221            i -= 1;
222            continue;
223        }
224
225        // Inside a hunk — update the line counters.
226        let first = raw.first().copied().unwrap_or(b' ');
227        if first == b'-' || first == b' ' {
228            before -= 1;
229        }
230        if first == b'+' || first == b' ' {
231            after -= 1;
232        }
233
234        // Hash the line, optionally stripping whitespace.
235        let hashed = if verbatim {
236            ctx.update(raw);
237            raw.len()
238        } else {
239            hash_without_whitespace(&mut ctx, raw)
240        };
241        patchlen += hashed;
242    }
243
244    // Flush the final patch.
245    text_flush_one_hunk(&mut result, &mut ctx);
246    if patchlen > 0 {
247        if let Some(coid) = current_commit {
248            if let Ok(pid) = ObjectId::from_bytes(&result) {
249                results.push((pid, coid));
250            }
251        }
252    }
253
254    results
255}
256
257/// Finalise `ctx`, accumulate its digest into `result` with byte-wise
258/// carry-addition (mirrors git's `flush_one_hunk`), and reset `ctx`.
259fn text_flush_one_hunk(result: &mut [u8; 20], ctx: &mut Sha1) {
260    let old = std::mem::replace(ctx, Sha1::new());
261    let hash: [u8; 20] = old.finalize().into();
262    let mut carry: u16 = 0;
263    for i in 0..20 {
264        carry = carry + result[i] as u16 + hash[i] as u16;
265        result[i] = carry as u8;
266        carry >>= 8;
267    }
268}
269
270/// Hash `raw` bytes into `ctx`, skipping ASCII whitespace.
271///
272/// Returns the number of non-whitespace bytes fed to the hasher.
273fn hash_without_whitespace(ctx: &mut Sha1, raw: &[u8]) -> usize {
274    let mut count = 0;
275    for &b in raw {
276        if !b.is_ascii_whitespace() {
277            ctx.update([b]);
278            count += 1;
279        }
280    }
281    count
282}
283
284/// Parse a 40-hex OID at the start of `s`.
285///
286/// Returns `None` if `s` does not start with exactly 40 lowercase hex digits
287/// optionally followed by ASCII whitespace.
288fn try_parse_oid_prefix(s: &str) -> Option<ObjectId> {
289    let s = s.trim_end_matches('\n').trim_end_matches('\r');
290    if s.len() < 40 {
291        return None;
292    }
293    let hex = &s[..40];
294    if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
295        return None;
296    }
297    // The character after the OID (if any) must be whitespace or end of string.
298    if s.len() > 40 && !s.as_bytes()[40].is_ascii_whitespace() {
299        return None;
300    }
301    let mut bytes = [0u8; 20];
302    for (i, chunk) in hex.as_bytes().chunks(2).enumerate() {
303        let hi = hex_val(chunk[0])?;
304        let lo = hex_val(chunk[1])?;
305        bytes[i] = (hi << 4) | lo;
306    }
307    ObjectId::from_bytes(&bytes).ok()
308}
309
310/// Convert a single ASCII hex digit to its value.
311fn hex_val(b: u8) -> Option<u8> {
312    match b {
313        b'0'..=b'9' => Some(b - b'0'),
314        b'a'..=b'f' => Some(b - b'a' + 10),
315        b'A'..=b'F' => Some(b - b'A' + 10),
316        _ => None,
317    }
318}
319
320/// Parse the before/after line counts from a `@@ -<old>[,<n>] +<new>[,<m>] @@` header.
321///
322/// Returns `(before, after)` counts, defaulting to 1 when the count is absent.
323fn scan_hunk_header(line: &str) -> (i32, i32) {
324    // line starts with "@@ -"
325    let rest = match line.strip_prefix("@@ -") {
326        Some(r) => r,
327        None => return (1, 1),
328    };
329    // Parse old count: skip start line number, grab optional comma + count.
330    let before = parse_hunk_count(rest);
331    // Find " +" separator.
332    let after = rest
333        .find(" +")
334        .and_then(|p| parse_hunk_count_opt(&rest[p + 2..]))
335        .unwrap_or(1);
336    (before, after)
337}
338
339/// Parse `<start>[,<count>]` and return `count` (or 1 if absent).
340fn parse_hunk_count(s: &str) -> i32 {
341    // Skip digits of the start line number.
342    let after_start = s.trim_start_matches(|c: char| c.is_ascii_digit());
343    if let Some(rest) = after_start.strip_prefix(',') {
344        rest.split(|c: char| !c.is_ascii_digit())
345            .next()
346            .and_then(|n| n.parse().ok())
347            .unwrap_or(1)
348    } else {
349        1
350    }
351}
352
353/// Same as [`parse_hunk_count`] but returns `Option`.
354fn parse_hunk_count_opt(s: &str) -> Option<i32> {
355    Some(parse_hunk_count(s))
356}
357
358/// Split `input` into lines, preserving the trailing `\n` on each line.
359///
360/// The final slice may lack a trailing newline if the input doesn't end with
361/// one.
362fn split_lines_with_nl(input: &[u8]) -> Vec<&[u8]> {
363    let mut lines = Vec::new();
364    let mut start = 0;
365    for (i, &b) in input.iter().enumerate() {
366        if b == b'\n' {
367            lines.push(&input[start..=i]);
368            start = i + 1;
369        }
370    }
371    if start < input.len() {
372        lines.push(&input[start..]);
373    }
374    lines
375}
376
377/// Compute the patch-ID for a single commit.
378///
379/// Returns `None` for merge commits (more than one parent), since those do not
380/// have a well-defined single-parent diff.  Root commits (no parents) are
381/// compared against the empty tree.
382///
383/// # Parameters
384///
385/// - `odb` — object database used to read commit, tree, and blob objects.
386/// - `commit_oid` — OID of the commit to compute the patch-ID for.
387///
388/// # Errors
389///
390/// Returns errors from object-database reads or object-parse failures.
391pub fn compute_patch_id(odb: &Odb, commit_oid: &ObjectId) -> Result<Option<ObjectId>> {
392    let obj = odb.read(commit_oid)?;
393    if obj.kind != ObjectKind::Commit {
394        return Ok(None);
395    }
396    let commit = parse_commit(&obj.data)?;
397
398    // Merge commits (>1 parent) have no defined patch-id.
399    if commit.parents.len() > 1 {
400        return Ok(None);
401    }
402
403    // Resolve the parent tree (None = empty tree for root commits).
404    let parent_tree_oid = if commit.parents.is_empty() {
405        None
406    } else {
407        let parent_obj = odb.read(&commit.parents[0])?;
408        let parent_commit = parse_commit(&parent_obj.data)?;
409        Some(parent_commit.tree)
410    };
411
412    // Compute tree-to-tree diff.
413    let mut diffs = diff_trees(odb, parent_tree_oid.as_ref(), Some(&commit.tree), "")?;
414
415    // Sort by primary path (lexicographic), matching diffcore_std ordering.
416    diffs.sort_by(|a, b| a.path().cmp(b.path()));
417
418    let mut hasher = Sha1::new();
419
420    for entry in &diffs {
421        // Determine src and dst path strings (both sides always carry the
422        // logical path, even for pure adds/deletes).
423        let src = entry
424            .old_path
425            .as_deref()
426            .or(entry.new_path.as_deref())
427            .unwrap_or("");
428        let dst = entry
429            .new_path
430            .as_deref()
431            .or(entry.old_path.as_deref())
432            .unwrap_or("");
433
434        // Compact paths: remove all ASCII-whitespace (mirrors git's remove_space).
435        let src_compact = compact_path(src);
436        let dst_compact = compact_path(dst);
437
438        // Hash the diff header line.
439        let header = format!("diff --git a/{src_compact} b/{dst_compact}\n");
440        hasher.update(header.as_bytes());
441
442        // Read blob content for the old and new sides.
443        let old_bytes = read_blob(odb, &entry.old_oid)?;
444        let new_bytes = read_blob(odb, &entry.new_oid)?;
445
446        // Convert to str slices for the line-diff algorithm; treat non-UTF-8
447        // content as empty (binary blobs only contribute their header).
448        let old_str = std::str::from_utf8(&old_bytes).unwrap_or("");
449        let new_str = std::str::from_utf8(&new_bytes).unwrap_or("");
450
451        // Hash non-whitespace bytes from every added and deleted line.
452        let diff = TextDiff::from_lines(old_str, new_str);
453        for change in diff.iter_all_changes() {
454            match change.tag() {
455                ChangeTag::Delete | ChangeTag::Insert => {
456                    let line = change.as_str().unwrap_or("");
457                    for &byte in line.as_bytes() {
458                        if !byte.is_ascii_whitespace() {
459                            hasher.update([byte]);
460                        }
461                    }
462                }
463                ChangeTag::Equal => {}
464            }
465        }
466    }
467
468    let digest = hasher.finalize();
469    ObjectId::from_bytes(&digest).map(Some)
470}
471
472/// Remove all ASCII-whitespace characters from a path string.
473///
474/// Mirrors git's `remove_space()` used when hashing diff headers.
475fn compact_path(path: &str) -> String {
476    path.bytes()
477        .filter(|b| !b.is_ascii_whitespace())
478        .map(|b| b as char)
479        .collect()
480}
481
482/// Read a blob's raw bytes from the ODB.
483///
484/// Returns an empty `Vec` for the zero OID (representing an absent file).
485fn read_blob(odb: &Odb, oid: &ObjectId) -> Result<Vec<u8>> {
486    if *oid == zero_oid() {
487        return Ok(Vec::new());
488    }
489    let obj = odb.read(oid)?;
490    Ok(obj.data)
491}