Skip to main content

grit_lib/
fast_export.rs

//! [`git fast-export`](https://git-scm.com/docs/git-fast-export) stream generation.
//!
//! Supports the subset needed by upstream tests: `--all`, `--anonymize`,
//! `--anonymize-map`, topological commit order with `reverse` (oldest first),
//! blob/commit marks, per-commit tree diffs, and annotated tags on commits.
6
7use std::collections::{HashMap, HashSet};
8use std::io::Write;
9
10use crate::diff::{diff_trees, DiffEntry, DiffStatus};
11use crate::error::{Error, Result};
12use crate::objects::{parse_commit, parse_tag, CommitData, ObjectId, ObjectKind};
13use crate::refs;
14use crate::repo::Repository;
15use crate::rev_list::{rev_list, OrderingMode, RevListOptions};
16
17use crate::index::{MODE_GITLINK, MODE_TREE};
18
/// Settings that control what [`export_stream`] writes.
///
/// `Default` yields every flag off and no anonymize mappings.
#[derive(Debug, Clone, Default)]
pub struct FastExportOptions {
    /// Export everything reachable from all refs. [`export_stream`] currently rejects
    /// any request where this is `false`.
    pub all: bool,
    /// Swap paths, ref names, idents, messages, blob contents, and object ids that are
    /// not expressed as marks for stable placeholders.
    pub anonymize: bool,
    /// User-chosen replacements, each either `from:to` or a bare `token` that maps to
    /// itself. A later entry for the same key overrides an earlier one. Only accepted
    /// together with `anonymize`.
    pub anonymize_maps: Vec<String>,
    /// Open the stream with `feature done` and close it with `done`.
    pub use_done_feature: bool,
    /// Skip `blob` commands; `M` lines then carry full object ids instead of marks
    /// (the behaviour of `git fast-export --no-data`).
    pub no_data: bool,
}
33
/// Bookkeeping for `--anonymize`.
///
/// Each kind of token (path component, ref component, object id, ident, tag message)
/// has its own cache so the same input always yields the same placeholder, plus a
/// counter used to mint the next placeholder of that kind.
struct AnonState<'a> {
    /// User-supplied replacements; consulted before any cache or generated token.
    seeds: &'a HashMap<String, String>,
    /// Path component -> placeholder already handed out.
    paths: HashMap<String, String>,
    /// Ref name component -> placeholder already handed out.
    refs: HashMap<String, String>,
    /// Object id (hex) -> placeholder already handed out.
    objs: HashMap<String, String>,
    /// `Name <email>` -> placeholder already handed out.
    idents: HashMap<String, String>,
    /// Tag message -> placeholder already handed out.
    tag_msgs: HashMap<String, String>,
    /// Index of the next generated `pathN`.
    path_n: u32,
    /// Index of the next generated `refN`.
    ref_n: u32,
    /// Number of object ids generated so far (the first generated id is 1).
    oid_n: u32,
    /// Index of the next generated `User N`.
    ident_n: u32,
    /// Index of the next generated commit subject.
    subject_n: u32,
    /// Index of the next generated tag message.
    tag_msg_n: u32,
    /// Index of the next generated blob payload.
    blob_n: u32,
}

impl<'a> AnonState<'a> {
    /// Creates a state with empty caches and every counter at zero.
    fn new(seeds: &'a HashMap<String, String>) -> Self {
        Self {
            seeds,
            paths: HashMap::new(),
            refs: HashMap::new(),
            objs: HashMap::new(),
            idents: HashMap::new(),
            tag_msgs: HashMap::new(),
            path_n: 0,
            ref_n: 0,
            oid_n: 0,
            ident_n: 0,
            subject_n: 0,
            tag_msg_n: 0,
            blob_n: 0,
        }
    }

    /// Returns the current counter value and advances the counter by one.
    fn next_index(counter: &mut u32) -> u32 {
        let current = *counter;
        *counter += 1;
        current
    }

    /// Resolves `key` to its placeholder.
    ///
    /// Lookup order: `seeds`, then the per-kind cache `map`. Only when both miss is
    /// `make` invoked; its result is stored in `map` (never in `seeds`) and returned.
    fn map_token(
        map: &mut HashMap<String, String>,
        seeds: &HashMap<String, String>,
        key: &str,
        make: impl FnOnce() -> String,
    ) -> String {
        if let Some(known) = seeds.get(key).or_else(|| map.get(key)) {
            return known.clone();
        }
        let fresh = make();
        map.insert(key.to_owned(), fresh.clone());
        fresh
    }

    /// Looks a path component up in the seeds.
    ///
    /// An exact match wins. Otherwise the part before the first `.` is tried and, on a
    /// hit, the original suffix (from that `.` onwards) is appended to the replacement.
    fn path_seed_lookup(comp: &str, seeds: &HashMap<String, String>) -> Option<String> {
        seeds.get(comp).cloned().or_else(|| {
            let (stem, _) = comp.split_once('.')?;
            let ext = &comp[stem.len()..];
            seeds.get(stem).map(|replacement| format!("{replacement}{ext}"))
        })
    }

    /// Anonymizes a single path component, preferring seed-derived names over `pathN`.
    fn anonymize_path_component(&mut self, comp: &str) -> String {
        let seeds = self.seeds;
        match Self::path_seed_lookup(comp, seeds) {
            // Seed-derived names are remembered in the path cache as well.
            Some(mapped) => self.paths.entry(comp.to_owned()).or_insert(mapped).clone(),
            None => {
                let counter = &mut self.path_n;
                Self::map_token(&mut self.paths, seeds, comp, || {
                    let n = Self::next_index(counter);
                    format!("path{n}")
                })
            }
        }
    }

    /// Anonymizes a `/`-separated path.
    ///
    /// A non-empty path that is itself a seed key is replaced wholesale; otherwise every
    /// component is anonymized on its own and the pieces are re-joined with `/`.
    fn anonymize_path(&mut self, path: &str) -> String {
        if !path.is_empty() {
            if let Some(whole) = self.seeds.get(path) {
                return whole.clone();
            }
        }
        path.split('/')
            .map(|part| self.anonymize_path_component(part))
            .collect::<Vec<_>>()
            .join("/")
    }

    /// Anonymizes a ref name while keeping a recognised namespace prefix
    /// (`refs/heads/`, `refs/tags/`, `refs/remotes/`, or plain `refs/`) intact.
    fn anonymize_refname(&mut self, refname: &str) -> String {
        const PREFIXES: &[&str] = &["refs/heads/", "refs/tags/", "refs/remotes/", "refs/"];
        let (prefix, rest) = PREFIXES
            .iter()
            .find_map(|p| refname.strip_prefix(*p).map(|tail| (*p, tail)))
            .unwrap_or(("", refname));
        if rest.is_empty() {
            return prefix.to_string();
        }
        let seeds = self.seeds;
        let mut pieces = Vec::new();
        for comp in rest.split('/') {
            let counter = &mut self.ref_n;
            pieces.push(Self::map_token(&mut self.refs, seeds, comp, || {
                let n = Self::next_index(counter);
                format!("ref{n}")
            }));
        }
        format!("{prefix}{}", pieces.join("/"))
    }

    /// Maps an object id (hex) to a 40-digit, zero-padded hex counter starting at 1.
    fn anonymize_oid_hex(&mut self, hex: &str) -> String {
        let seeds = self.seeds;
        let counter = &mut self.oid_n;
        Self::map_token(&mut self.objs, seeds, hex, || {
            *counter += 1;
            format!("{:040x}", *counter)
        })
    }

    /// Anonymizes a `HEADER NAME <EMAIL> DATE TZ` line.
    ///
    /// The header word and everything after the last `>` are kept; `NAME <EMAIL>` is
    /// replaced. A line without a space is returned unchanged, and a line without `>`
    /// gets a fixed "malformed" ident.
    fn anonymize_ident_line(&mut self, line: &str) -> String {
        let Some((header, rest)) = line.split_once(' ') else {
            return line.to_owned();
        };
        let rest = rest.trim_end();
        let Some(gt) = rest.rfind('>') else {
            return format!("{header} Malformed Ident <malformed@example.com> 0 -0000");
        };
        let (name_email, tail) = rest.split_at(gt + 1);
        let after = tail.trim_start();
        let seeds = self.seeds;
        let counter = &mut self.ident_n;
        let ident = Self::map_token(&mut self.idents, seeds, name_email, || {
            let n = Self::next_index(counter);
            format!("User {n} <user{n}@example.com>")
        });
        format!("{header} {ident} {after}")
    }

    /// Produces the next numbered commit message; the original text is never consulted.
    fn anonymize_commit_message(&mut self) -> String {
        let n = Self::next_index(&mut self.subject_n);
        format!("subject {n}\n\nbody\n")
    }

    /// Maps a tag message to a numbered placeholder; equal messages share a placeholder.
    fn anonymize_tag_message(&mut self, msg: &str) -> String {
        let seeds = self.seeds;
        let counter = &mut self.tag_msg_n;
        Self::map_token(&mut self.tag_msgs, seeds, msg, || {
            let n = Self::next_index(counter);
            format!("tag message {n}")
        })
    }

    /// Produces the next numbered blob payload; the original content is never consulted.
    fn anonymize_blob_payload(&mut self) -> Vec<u8> {
        let n = Self::next_index(&mut self.blob_n);
        format!("anonymous blob {n}").into_bytes()
    }
}
201
202fn parse_anonymize_maps(entries: &[String]) -> Result<HashMap<String, String>> {
203    let mut out = HashMap::new();
204    for raw in entries {
205        let raw = raw.trim();
206        if raw.is_empty() {
207            return Err(Error::InvalidRef(
208                "--anonymize-map token cannot be empty".to_owned(),
209            ));
210        }
211        if let Some((k, v)) = raw.split_once(':') {
212            if k.is_empty() || v.is_empty() {
213                return Err(Error::InvalidRef(
214                    "--anonymize-map token cannot be empty".to_owned(),
215                ));
216            }
217            out.insert(k.to_string(), v.to_string());
218        } else {
219            out.insert(raw.to_string(), raw.to_string());
220        }
221    }
222    Ok(out)
223}
224
225/// Ref tips used to assign each exported commit a `commit <ref>` line (Git `revision_sources`).
226///
227/// Includes `refs/heads/*` and peeled `refs/tags/*` so tagged-only commits (e.g. `git tag E` with no
228/// branch) still get a valid source ref. Without tags, `fast-export --all` can fail with
229/// `no ref source for commit` when the walk reaches a commit reachable only via tags.
230fn revision_source_tips(repo: &Repository) -> Result<Vec<(String, ObjectId)>> {
231    let mut tips = refs::list_refs(&repo.git_dir, "refs/heads/")?;
232    for (name, oid) in refs::list_refs(&repo.git_dir, "refs/tags/")? {
233        let tip = match peel_tag_to_commit_oid(repo, oid) {
234            Ok(c) => c,
235            Err(_) => continue,
236        };
237        tips.push((name, tip));
238    }
239    Ok(tips)
240}
241
242fn ref_source_for_commit(
243    repo: &Repository,
244    oid: ObjectId,
245    head_branches: &[(String, ObjectId)],
246) -> Result<String> {
247    let mut best: Option<(&str, (u8, usize))> = None;
248    for (name, tip) in head_branches {
249        if *tip != oid {
250            continue;
251        }
252        let score = (
253            if name.starts_with("refs/heads/") {
254                0
255            } else {
256                1
257            },
258            name.len(),
259        );
260        if best.is_none_or(|(_, s)| score < s) {
261            best = Some((name.as_str(), score));
262        }
263    }
264    if let Some((n, _)) = best {
265        return Ok(n.to_string());
266    }
267    // Propagate first-seen ref name along parents (matches Git `revision_sources`).
268    let mut source: HashMap<ObjectId, String> = HashMap::new();
269    let mut queue: std::collections::VecDeque<ObjectId> = std::collections::VecDeque::new();
270    for (name, tip) in head_branches {
271        if source.insert(*tip, name.clone()).is_none() {
272            queue.push_back(*tip);
273        }
274    }
275    while let Some(c) = queue.pop_front() {
276        let pname = source.get(&c).cloned().unwrap_or_default();
277        let commit = load_commit(repo, c)?;
278        for p in commit.parents {
279            if source.contains_key(&p) {
280                continue;
281            }
282            source.insert(p, pname.clone());
283            queue.push_back(p);
284        }
285    }
286    source
287        .get(&oid)
288        .cloned()
289        .ok_or_else(|| Error::InvalidRef(format!("no ref source for commit {oid}")))
290}
291
292fn load_commit(repo: &Repository, oid: ObjectId) -> Result<CommitData> {
293    let obj = repo.odb.read(&oid)?;
294    if obj.kind != ObjectKind::Commit {
295        return Err(Error::CorruptObject(format!(
296            "expected commit, got {}",
297            obj.kind.as_str()
298        )));
299    }
300    parse_commit(&obj.data)
301}
302
303fn peel_tag_to_commit_oid(repo: &Repository, mut oid: ObjectId) -> Result<ObjectId> {
304    loop {
305        let obj = repo.odb.read(&oid)?;
306        match obj.kind {
307            ObjectKind::Commit => return Ok(oid),
308            ObjectKind::Tag => {
309                let t = parse_tag(&obj.data)?;
310                oid = t.object;
311            }
312            _ => {
313                return Err(Error::CorruptObject(
314                    "tag does not point to a commit".to_owned(),
315                ));
316            }
317        }
318    }
319}
320
321fn depth_first_diff_sort(entries: &mut [DiffEntry]) {
322    entries.sort_by(|a, b| {
323        let pa = a.path();
324        let pb = b.path();
325        let la = pa.len();
326        let lb = pb.len();
327        let minlen = la.min(lb);
328        let cmp = pa.as_bytes()[..minlen].cmp(&pb.as_bytes()[..minlen]);
329        if cmp != std::cmp::Ordering::Equal {
330            return cmp;
331        }
332        let len_cmp = lb.cmp(&la);
333        if len_cmp != std::cmp::Ordering::Equal {
334            return len_cmp;
335        }
336        let ar = matches!(a.status, DiffStatus::Renamed);
337        let br = matches!(b.status, DiffStatus::Renamed);
338        ar.cmp(&br)
339    });
340}
341
342/// Write a fast-import stream for the repository to `writer`.
343///
344/// # Errors
345///
346/// Propagates object database, ref, and revision walk errors.
347pub fn export_stream(
348    repo: &Repository,
349    mut writer: impl Write,
350    options: &FastExportOptions,
351) -> Result<()> {
352    if !options.all {
353        return Err(Error::InvalidRef(
354            "fast-export: only --all is implemented".to_owned(),
355        ));
356    }
357
358    let seeds = if options.anonymize {
359        parse_anonymize_maps(&options.anonymize_maps)?
360    } else {
361        HashMap::new()
362    };
363
364    if !options.anonymize && !options.anonymize_maps.is_empty() {
365        return Err(Error::InvalidRef(
366            "the option '--anonymize-map' requires '--anonymize'".to_owned(),
367        ));
368    }
369
370    let head_branches = revision_source_tips(repo)?;
371
372    let opts = RevListOptions {
373        all_refs: true,
374        ordering: OrderingMode::Topo,
375        reverse: true,
376        ..RevListOptions::default()
377    };
378    let rev_result = rev_list(repo, &[] as &[String], &[] as &[String], &opts)?;
379    let commits: Vec<ObjectId> = rev_result.commits;
380
381    let commit_set: HashSet<ObjectId> = commits.iter().copied().collect();
382
383    let mut marks: HashMap<ObjectId, u32> = HashMap::new();
384    let mut next_mark: u32 = 0;
385
386    let mut anon = if options.anonymize {
387        Some(AnonState::new(&seeds))
388    } else {
389        None
390    };
391
392    if options.use_done_feature {
393        writeln!(writer, "feature done")?;
394    }
395
396    for oid in &commits {
397        let raw_commit = load_commit(repo, *oid)?;
398        let parent_tree = if let Some(p) = raw_commit.parents.first() {
399            let pc = load_commit(repo, *p)?;
400            Some(pc.tree)
401        } else {
402            None
403        };
404        let diffs = diff_trees(&repo.odb, parent_tree.as_ref(), Some(&raw_commit.tree), "")?;
405        let mut diff_vec: Vec<DiffEntry> = diffs
406            .into_iter()
407            .filter(|e| {
408                matches!(
409                    e.status,
410                    DiffStatus::Added
411                        | DiffStatus::Deleted
412                        | DiffStatus::Modified
413                        | DiffStatus::Renamed
414                        | DiffStatus::Copied
415                        | DiffStatus::TypeChanged
416                )
417            })
418            .collect();
419        depth_first_diff_sort(&mut diff_vec);
420
421        if !options.no_data {
422            for e in &diff_vec {
423                if e.status == DiffStatus::Deleted {
424                    continue;
425                }
426                let mode = u32::from_str_radix(e.new_mode.trim(), 8).unwrap_or(0);
427                if mode == MODE_TREE || mode == MODE_GITLINK {
428                    continue;
429                }
430                let blob_oid = e.new_oid;
431                if marks.contains_key(&blob_oid) {
432                    continue;
433                }
434                next_mark += 1;
435                marks.insert(blob_oid, next_mark);
436                writeln!(writer, "blob")?;
437                writeln!(writer, "mark :{next_mark}")?;
438                let payload = if let Some(a) = anon.as_mut() {
439                    a.anonymize_blob_payload()
440                } else {
441                    let o = repo.odb.read(&blob_oid)?;
442                    if o.kind != ObjectKind::Blob {
443                        return Err(Error::CorruptObject("expected blob".to_owned()));
444                    }
445                    o.data
446                };
447                writeln!(writer, "data {}", payload.len())?;
448                writer.write_all(&payload)?;
449                writeln!(writer)?;
450            }
451        }
452
453        let refname = ref_source_for_commit(repo, *oid, &head_branches)?;
454        let export_ref = if let Some(a) = anon.as_mut() {
455            a.anonymize_refname(&refname)
456        } else {
457            refname.clone()
458        };
459
460        if raw_commit.parents.is_empty() {
461            writeln!(writer, "reset {export_ref}")?;
462        }
463
464        next_mark += 1;
465        let commit_mark = next_mark;
466        marks.insert(*oid, commit_mark);
467
468        writeln!(writer, "commit {export_ref}")?;
469        writeln!(writer, "mark :{commit_mark}")?;
470
471        let author_line = if let Some(a) = anon.as_mut() {
472            a.anonymize_ident_line(&format!("author {}", raw_commit.author))
473        } else {
474            format!("author {}", raw_commit.author)
475        };
476        let committer_line = if let Some(a) = anon.as_mut() {
477            a.anonymize_ident_line(&format!("committer {}", raw_commit.committer))
478        } else {
479            format!("committer {}", raw_commit.committer)
480        };
481        writeln!(writer, "{author_line}")?;
482        writeln!(writer, "{committer_line}")?;
483
484        let message = if let Some(a) = anon.as_mut() {
485            a.anonymize_commit_message()
486        } else {
487            raw_commit.message.clone()
488        };
489        let msg_bytes = message.as_bytes();
490        writeln!(writer, "data {}", msg_bytes.len())?;
491        writer.write_all(msg_bytes)?;
492        writeln!(writer)?;
493
494        for (i, p) in raw_commit.parents.iter().enumerate() {
495            let label = if i == 0 { "from" } else { "merge" };
496            write!(writer, "{label} ")?;
497            if let Some(&m) = marks.get(p) {
498                writeln!(writer, ":{m}")?;
499            } else {
500                let hex = p.to_hex();
501                let out = if let Some(a) = anon.as_mut() {
502                    a.anonymize_oid_hex(&hex)
503                } else {
504                    hex
505                };
506                writeln!(writer, "{out}")?;
507            }
508        }
509
510        let mut changed: HashSet<String> = HashSet::new();
511        for e in &diff_vec {
512            match e.status {
513                DiffStatus::Deleted => {
514                    let path = if let Some(a) = anon.as_mut() {
515                        a.anonymize_path(e.path())
516                    } else {
517                        e.path().to_string()
518                    };
519                    writeln!(writer, "D {path}")?;
520                    changed.insert(e.path().to_string());
521                }
522                DiffStatus::Renamed | DiffStatus::Copied => {
523                    let old_p = e.old_path.as_deref().unwrap_or("");
524                    let skip_modify = e.old_oid == e.new_oid
525                        && e.old_mode == e.new_mode
526                        && !changed.contains(old_p);
527                    if !changed.contains(old_p) {
528                        let op = if let Some(a) = anon.as_mut() {
529                            a.anonymize_path(old_p)
530                        } else {
531                            old_p.to_string()
532                        };
533                        let np = if let Some(a) = anon.as_mut() {
534                            a.anonymize_path(e.path())
535                        } else {
536                            e.path().to_string()
537                        };
538                        writeln!(writer, "{} {op} {np}", e.status.letter())?;
539                    }
540                    if !skip_modify {
541                        fallthrough_modify(
542                            repo,
543                            &mut writer,
544                            e,
545                            &marks,
546                            anon.as_mut(),
547                            options.anonymize,
548                            options.no_data,
549                        )?;
550                    }
551                    changed.insert(old_p.to_string());
552                    changed.insert(e.path().to_string());
553                }
554                DiffStatus::Added | DiffStatus::Modified | DiffStatus::TypeChanged => {
555                    fallthrough_modify(
556                        repo,
557                        &mut writer,
558                        e,
559                        &marks,
560                        anon.as_mut(),
561                        options.anonymize,
562                        options.no_data,
563                    )?;
564                    changed.insert(e.path().to_string());
565                }
566                _ => {}
567            }
568        }
569        writeln!(writer)?;
570    }
571
572    // Annotated tags that point at exported commits
573    let tag_refs = refs::list_refs(&repo.git_dir, "refs/tags/")?;
574    for (full_name, tag_oid) in tag_refs {
575        let tag_obj = repo.odb.read(&tag_oid)?;
576        if tag_obj.kind != ObjectKind::Tag {
577            continue;
578        }
579        let tag_data = parse_tag(&tag_obj.data)?;
580        let Ok(target_commit) = peel_tag_to_commit_oid(repo, tag_data.object) else {
581            continue;
582        };
583        if !commit_set.contains(&target_commit) {
584            continue;
585        }
586        let Some(&tip_mark) = marks.get(&target_commit) else {
587            continue;
588        };
589
590        let export_name = if let Some(a) = anon.as_mut() {
591            a.anonymize_refname(&full_name)
592        } else {
593            full_name.clone()
594        };
595        let short_name = export_name
596            .strip_prefix("refs/tags/")
597            .unwrap_or(&export_name)
598            .to_string();
599
600        let tagger_line = if let Some(t) = tag_data.tagger.as_deref() {
601            if let Some(a) = anon.as_mut() {
602                a.anonymize_ident_line(&format!("tagger {t}"))
603            } else {
604                format!("tagger {t}")
605            }
606        } else {
607            String::new()
608        };
609
610        let msg = if options.anonymize {
611            anon.as_mut()
612                .map(|a| a.anonymize_tag_message(&tag_data.message))
613                .unwrap_or_default()
614        } else {
615            tag_data.message.clone()
616        };
617
618        writeln!(writer, "tag {short_name}")?;
619        writeln!(writer, "from :{tip_mark}")?;
620        if !tagger_line.is_empty() {
621            writeln!(writer, "{tagger_line}")?;
622        }
623        let msg_bytes = msg.as_bytes();
624        writeln!(writer, "data {}", msg_bytes.len())?;
625        writer.write_all(msg_bytes)?;
626        writeln!(writer)?;
627    }
628
629    if options.use_done_feature {
630        writeln!(writer, "done")?;
631    }
632
633    Ok(())
634}
635
636fn fallthrough_modify(
637    _repo: &Repository,
638    writer: &mut impl Write,
639    e: &DiffEntry,
640    marks: &HashMap<ObjectId, u32>,
641    mut anon: Option<&mut AnonState>,
642    _anonymize: bool,
643    no_data: bool,
644) -> Result<()> {
645    let mode = u32::from_str_radix(e.new_mode.trim(), 8).unwrap_or(0);
646    let path = if let Some(a) = anon.as_mut() {
647        a.anonymize_path(e.path())
648    } else {
649        e.path().to_string()
650    };
651    if mode == MODE_GITLINK {
652        let hex = e.new_oid.to_hex();
653        let oid_out = if let Some(a) = anon {
654            a.anonymize_oid_hex(&hex)
655        } else {
656            hex
657        };
658        writeln!(writer, "M {:06o} {oid_out} {path}", mode)?;
659        return Ok(());
660    }
661    if no_data {
662        let hex = e.new_oid.to_hex();
663        let oid_out = if let Some(a) = anon.as_mut() {
664            a.anonymize_oid_hex(&hex)
665        } else {
666            hex
667        };
668        writeln!(writer, "M {:06o} {oid_out} {path}", mode)?;
669        return Ok(());
670    }
671    let Some(&bm) = marks.get(&e.new_oid) else {
672        return Err(Error::IndexError(format!(
673            "fast-export: missing mark for blob {}",
674            e.new_oid
675        )));
676    };
677    writeln!(writer, "M {:06o} :{bm} {path}", mode)?;
678    Ok(())
679}