Skip to main content

git_lfs_git/
diff_index.rs

1//! `git diff-index -z` parser.
2//!
3//! Used by `git lfs status` to enumerate staged + unstaged changes
4//! against HEAD. The `-z` form is mandatory for correctness: paths can
5//! contain spaces, newlines, and quoting metacharacters; without `-z`,
6//! git would render those quoted and we'd have to undo the encoding.
7//!
8//! Output format (one record per change):
9//! ```text
10//! :<src-mode> <dst-mode> <src-sha> <dst-sha> <status>\0<src>\0[<dst>\0]
11//! ```
12//! `<status>` is a single letter A/M/D/R/C/T/U/X, optionally followed by
13//! a 1–3 digit similarity score for `R` and `C`. The trailing `<dst>`
14//! field is only present for renames and copies.
15
16use std::path::Path;
17use std::process::Command;
18
19use crate::Error;
20
/// One change record from `git diff-index -z` output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiffEntry {
    /// Object id from the `<src-sha>` metadata field, exactly as git printed it.
    pub src_sha: String,
    /// Object id from the `<dst-sha>` metadata field, exactly as git printed it.
    pub dst_sha: String,
    /// Leading character of the `<status>` metadata field (`A`, `M`, `D`, `R`, ...).
    pub status: char,
    /// Numeric score that followed the status letter, if there was one.
    /// Git attaches it to `R` (rename) and `C` (copy) records.
    pub similarity: Option<u16>,
    /// First path of the record.
    pub src_name: String,
    /// Second path of the record; only `R` and `C` records carry one.
    pub dst_name: Option<String>,
}

impl DiffEntry {
    /// Path at which the change lands: the destination for renames and
    /// copies, and the sole (source) path for every other record.
    pub fn path(&self) -> &str {
        match &self.dst_name {
            Some(dst) => dst.as_str(),
            None => self.src_name.as_str(),
        }
    }
}
41
42/// Run `git diff-index -z [--cached] <ref>` and return the parsed entries.
43///
44/// `cached = true` reports staged changes (HEAD vs index); `cached = false`
45/// reports working-tree changes (HEAD vs working tree, including unstaged).
46///
47/// `-M` (rename detection) matches upstream's `git lfs status` behavior;
48/// without it, a `git mv` shows up as a delete + add pair instead of an `R`
49/// entry, which the JSON-shape tests rely on.
50pub fn diff_index(cwd: &Path, refname: &str, cached: bool) -> Result<Vec<DiffEntry>, Error> {
51    let mut cmd = Command::new("git");
52    cmd.arg("-C").arg(cwd).args(["diff-index", "-M", "-z"]);
53    if cached {
54        cmd.arg("--cached");
55    }
56    cmd.arg(refname);
57    let out = cmd.output()?;
58    if !out.status.success() {
59        return Err(Error::Failed(format!(
60            "git diff-index failed: {}",
61            String::from_utf8_lossy(&out.stderr).trim()
62        )));
63    }
64    parse(&out.stdout)
65}
66
67fn parse(bytes: &[u8]) -> Result<Vec<DiffEntry>, Error> {
68    // Strip the trailing NUL git always emits so the iterator below
69    // doesn't see a phantom empty token at the end.
70    let trimmed = bytes.strip_suffix(b"\0").unwrap_or(bytes);
71    if trimmed.is_empty() {
72        return Ok(Vec::new());
73    }
74
75    let mut tokens = trimmed.split(|&b| b == 0);
76    let mut entries = Vec::new();
77    while let Some(meta) = tokens.next() {
78        let meta_s = std::str::from_utf8(meta)
79            .map_err(|e| Error::Failed(format!("diff-index: non-utf8 metadata: {e}")))?;
80        let body = meta_s
81            .strip_prefix(':')
82            .ok_or_else(|| Error::Failed(format!("diff-index: missing ':' in {meta_s:?}")))?;
83        let parts: Vec<&str> = body.split_whitespace().collect();
84        if parts.len() != 5 {
85            return Err(Error::Failed(format!(
86                "diff-index: expected 5 metadata fields in {meta_s:?}, got {}",
87                parts.len()
88            )));
89        }
90        let src_sha = parts[2].to_owned();
91        let dst_sha = parts[3].to_owned();
92        let status_field = parts[4];
93        let status = status_field
94            .chars()
95            .next()
96            .ok_or_else(|| Error::Failed(format!("diff-index: empty status in {meta_s:?}")))?;
97        let similarity = if status_field.len() > 1 {
98            status_field[1..].parse::<u16>().ok()
99        } else {
100            None
101        };
102
103        let src = tokens
104            .next()
105            .ok_or_else(|| Error::Failed(format!("diff-index: missing src name for {meta_s:?}")))?;
106        let src_name = std::str::from_utf8(src)
107            .map_err(|e| Error::Failed(format!("diff-index: non-utf8 src name: {e}")))?
108            .to_owned();
109
110        let dst_name = if matches!(status, 'R' | 'C') {
111            let dst = tokens.next().ok_or_else(|| {
112                Error::Failed(format!(
113                    "diff-index: missing dst name for {status} record {meta_s:?}"
114                ))
115            })?;
116            Some(
117                std::str::from_utf8(dst)
118                    .map_err(|e| Error::Failed(format!("diff-index: non-utf8 dst name: {e}")))?
119                    .to_owned(),
120            )
121        } else {
122            None
123        };
124
125        entries.push(DiffEntry {
126            src_sha,
127            dst_sha,
128            status,
129            similarity,
130            src_name,
131            dst_name,
132        });
133    }
134    Ok(entries)
135}
136
/// Unit tests for the `-z` record parser, plus one end-to-end check that
/// runs the real `git` binary against a scratch repository.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_empty_input() {
        assert!(parse(b"").unwrap().is_empty());
        assert!(parse(b"\0").unwrap().is_empty());
    }

    #[test]
    fn parse_modification() {
        let raw = b":100644 100644 abc 123 M\0file.txt\0";
        let entries = parse(raw).unwrap();
        assert_eq!(entries.len(), 1);
        let e = &entries[0];
        assert_eq!(e.src_sha, "abc");
        assert_eq!(e.dst_sha, "123");
        assert_eq!(e.status, 'M');
        assert_eq!(e.similarity, None);
        assert_eq!(e.src_name, "file.txt");
        assert_eq!(e.dst_name, None);
    }

    #[test]
    fn parse_addition_has_zero_src_sha() {
        let raw = b":000000 100644 0000000 1234567 A\0new.bin\0";
        let entries = parse(raw).unwrap();
        assert_eq!(entries[0].status, 'A');
        assert_eq!(entries[0].src_sha, "0000000");
        assert_eq!(entries[0].dst_sha, "1234567");
    }

    #[test]
    fn parse_rename_with_score_and_two_paths() {
        let raw = b":100644 100644 abc 123 R86\0old/path.txt\0new/path.txt\0";
        let entries = parse(raw).unwrap();
        let e = &entries[0];
        assert_eq!(e.status, 'R');
        assert_eq!(e.similarity, Some(86));
        assert_eq!(e.src_name, "old/path.txt");
        assert_eq!(e.dst_name.as_deref(), Some("new/path.txt"));
        assert_eq!(e.path(), "new/path.txt");
    }

    #[test]
    fn parse_multiple_records() {
        let raw = b":100644 100644 a 1 M\0a.txt\0\
                   :100644 100644 b 2 M\0b.txt\0\
                   :100644 100644 c 3 R100\0c.txt\0d.txt\0";
        let entries = parse(raw).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].src_name, "a.txt");
        assert_eq!(entries[1].src_name, "b.txt");
        assert_eq!(entries[2].status, 'R');
        assert_eq!(entries[2].dst_name.as_deref(), Some("d.txt"));
    }

    #[test]
    fn parse_path_with_embedded_special_chars() {
        // With -z, paths are emitted raw — including newlines and tabs
        // that would normally be quote-escaped without -z.
        let raw = b":100644 100644 a 1 M\0name with\nnewline\0";
        let entries = parse(raw).unwrap();
        assert_eq!(entries[0].src_name, "name with\nnewline");
    }

    #[test]
    fn parse_missing_colon_errors() {
        let raw = b"100644 100644 a 1 M\0file\0";
        assert!(parse(raw).is_err());
    }

    #[test]
    fn parse_truncated_record_errors() {
        // Status R, but no dst name follows — malformed.
        let raw = b":100644 100644 a 1 R86\0only-src\0";
        assert!(parse(raw).is_err());
    }

    #[test]
    fn parse_wrong_field_count_errors() {
        // Only four metadata fields: the dst sha is missing.
        let raw = b":100644 100644 a M\0file\0";
        assert!(parse(raw).is_err());
    }

    #[test]
    fn parse_missing_src_name_errors() {
        // Metadata token with no path token after it.
        let raw = b":100644 100644 a 1 M";
        assert!(parse(raw).is_err());
    }

    #[test]
    fn parse_non_utf8_path_errors() {
        let raw = b":100644 100644 a 1 M\0\xff\0";
        assert!(parse(raw).is_err());
    }

    #[test]
    fn diff_index_against_real_repo_finds_staged_modification() {
        use crate::tests::commit_helper::*;
        let repo = init_repo();
        commit_file(&repo, "a.txt", b"first");
        // Modify and stage.
        std::fs::write(repo.path().join("a.txt"), b"second").unwrap();
        let add = std::process::Command::new("git")
            .arg("-C")
            .arg(repo.path())
            .args(["add", "a.txt"])
            .status()
            .unwrap();
        // Fail here, not at a later assertion, if staging did not happen.
        assert!(add.success(), "git add failed: {add}");

        let staged = diff_index(repo.path(), "HEAD", true).unwrap();
        assert_eq!(staged.len(), 1, "{staged:?}");
        assert_eq!(staged[0].status, 'M');
        assert_eq!(staged[0].src_name, "a.txt");

        // The working tree matches the index, but diff_index without
        // --cached compares HEAD vs working tree, so the staged change is
        // still reported here — the caller is responsible for deduping
        // (which `status` does).
        let unstaged = diff_index(repo.path(), "HEAD", false).unwrap();
        assert_eq!(unstaged.len(), 1);
    }
}