use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::{self, Read, Write};
use git_lfs_pointer::{Oid, Pointer};
use git_lfs_store::Store;
use globset::GlobSet;
use sha2::{Digest, Sha256};
use super::fast_export::{Blob, Command, Commit, DataRef, FileChange, Reader};
use super::fast_import::Writer;
/// Path of the repository-root `.gitattributes` file.
const ATTRS_PATH: &str = ".gitattributes";
/// Base for marks this filter allocates itself (e.g. for rewritten
/// `.gitattributes` blobs). Starting at 2^30 keeps self-allocated marks well
/// away from the marks `git fast-export` assigns, so the ranges cannot clash.
const FRESH_MARK_BASE: u32 = 1 << 30;
/// Direction of the history rewrite.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// Replace matching blob contents with LFS pointer text, storing the real
    /// content in the object store.
    Import,
    /// Replace LFS pointer blobs with the real content read from the store.
    Export,
    /// Re-run pointer conversion driven by the `.gitattributes` files present
    /// in each commit instead of include/exclude globs.
    Fixup,
}
/// Configuration for a [`Transform`] run.
#[derive(Debug, Clone, Default)]
pub struct Options {
    /// Only paths matching this set are eligible for conversion; `None`
    /// means every path is eligible.
    pub include: Option<GlobSet>,
    /// Paths matching this set are never converted; checked before `include`.
    pub exclude: Option<GlobSet>,
    /// Minimum blob size in bytes for import conversion; smaller blobs pass
    /// through unchanged.
    pub above: u64,
    /// Attribute lines unconditionally added to every rewritten
    /// `.gitattributes`.
    pub attrs_add_initial: Vec<String>,
    /// Attribute lines stripped (by trimmed exact match) from existing
    /// `.gitattributes` content.
    pub attrs_remove_initial: Vec<String>,
    /// When set, each converted path is logged to stderr with its commit oid.
    pub verbose: bool,
    /// Suppresses deriving new attribute patterns from converted paths.
    pub skip_path_derived_attrs: bool,
    /// Raw contents applied last when building the fixup-mode attribute
    /// stack. NOTE(review): presumably `$GIT_DIR/info/attributes`; the
    /// precedence is inferred from the apply order in `process_commit_fixup`.
    pub info_attrs: Vec<u8>,
    /// Raw global attribute content, applied first in fixup mode.
    pub global_attrs: Vec<u8>,
}
/// Counters and outputs accumulated over a [`Transform::run`].
#[derive(Debug, Default)]
pub struct Stats {
    /// Number of blobs whose content was rewritten (to or from pointer text).
    pub blobs_converted: u64,
    /// Total payload bytes of converted blobs (original content size on
    /// import; pointer-declared size on export).
    pub bytes_converted: u64,
    /// Number of commit commands processed.
    pub commits_seen: u64,
    /// Final union of derived and initial attribute lines (filled at the end
    /// of `run`).
    pub patterns: BTreeSet<String>,
    /// `(mark, original oid)` pair for every commit that carried both.
    pub commit_marks: Vec<(u32, String)>,
}
/// Streaming rewriter: consumes a `git fast-export` stream, converts blobs
/// to/from LFS pointers, and emits an equivalent `git fast-import` stream.
pub struct Transform<'a> {
    /// LFS object store used to stash content (import) or resolve pointers
    /// (export).
    store: &'a Store,
    opts: Options,
    mode: Mode,
    /// Marked blobs seen in the stream but not yet written out; drained when
    /// a commit tells us the path (and therefore the treatment) of each mark.
    blob_buffer: HashMap<u32, Vec<u8>>,
    /// Marks already written downstream, so each blob is emitted at most once.
    emitted: HashSet<u32>,
    /// Next self-allocated mark; starts at `FRESH_MARK_BASE` to avoid
    /// colliding with exporter-assigned marks.
    next_fresh: u32,
    /// Attribute lines derived from converted paths during the run.
    attrs_add: BTreeSet<String>,
    /// Copy of `opts.attrs_add_initial`, appended after derived patterns.
    attrs_add_initial: Vec<String>,
    /// Trimmed lines to drop from existing `.gitattributes` content.
    attrs_remove: BTreeSet<String>,
    pub stats: Stats,
}
impl<'a> Transform<'a> {
    /// Builds a transform over `store` with the given options and mode.
    pub fn new(store: &'a Store, opts: Options, mode: Mode) -> Self {
        // Copy these out of `opts` before it is moved into the struct.
        let attrs_add_initial = opts.attrs_add_initial.clone();
        let mut attrs_remove: BTreeSet<String> = BTreeSet::new();
        for line in &opts.attrs_remove_initial {
            attrs_remove.insert(line.clone());
        }
        Self {
            store,
            opts,
            mode,
            blob_buffer: HashMap::new(),
            emitted: HashSet::new(),
            next_fresh: FRESH_MARK_BASE,
            attrs_add: BTreeSet::new(),
            attrs_add_initial,
            attrs_remove,
            stats: Stats::default(),
        }
    }

    /// Drives the whole stream: parses commands from `r`, rewrites them, and
    /// writes the result to `w`.
    ///
    /// Returns the accumulated [`Stats`]; `stats.patterns` is filled here as
    /// the union of derived patterns and the initial "add" lines.
    pub fn run<R: Read, W: Write>(mut self, r: R, w: W) -> io::Result<Stats> {
        let mut reader = Reader::new(r);
        let mut writer = Writer::new(w);
        while let Some(cmd) = reader.next()? {
            self.process(cmd, &mut writer)?;
        }
        writer.flush()?;
        self.stats.patterns = self.attrs_add.clone();
        for p in &self.attrs_add_initial {
            self.stats.patterns.insert(p.clone());
        }
        Ok(self.stats)
    }

    /// Dispatches one fast-export command.
    ///
    /// Marked blobs are buffered rather than forwarded: only when a commit
    /// references a mark do we learn the blob's path, which decides whether
    /// it needs conversion.
    fn process<W: Write>(&mut self, cmd: Command, writer: &mut Writer<W>) -> io::Result<()> {
        match cmd {
            Command::Blob(b) => {
                if let Some(mark) = b.mark {
                    self.blob_buffer.insert(mark, b.data);
                } else {
                    // An unmarked blob can never be referenced later; pass it
                    // straight through.
                    writer.write(&Command::Blob(b))?;
                }
                Ok(())
            }
            Command::Commit(c) => match self.mode {
                Mode::Fixup => self.process_commit_fixup(c, writer),
                _ => self.process_commit(c, writer),
            },
            other => writer.write(&other),
        }
    }

    /// Fixup mode: converts blobs according to the `.gitattributes` files
    /// carried by this commit itself, instead of include/exclude globs.
    ///
    /// NOTE(review): only attrs files *modified in this commit* are consulted;
    /// `.gitattributes` inherited unchanged from a parent commit is invisible
    /// here — confirm the caller arranges for fixup streams to re-state them.
    fn process_commit_fixup<W: Write>(
        &mut self,
        c: Commit,
        writer: &mut Writer<W>,
    ) -> io::Result<()> {
        self.stats.commits_seen += 1;
        if let (Some(mark), Some(oid)) = (c.mark, c.original_oid.as_ref()) {
            self.stats.commit_marks.push((mark, oid.clone()));
        }
        // Gather every .gitattributes file (root or nested) this commit
        // touches, keyed by its directory.
        let mut attrs_dirs: Vec<(String, Vec<u8>)> = Vec::new();
        for ch in &c.file_changes {
            match ch {
                FileChange::Modify {
                    dataref: DataRef::Mark(m),
                    path,
                    ..
                } if is_attrs_path(path) => {
                    if let Some(content) = self.blob_buffer.get(m) {
                        attrs_dirs.push((dir_of(path), content.clone()));
                    }
                }
                FileChange::ModifyInline { path, data, .. } if is_attrs_path(path) => {
                    attrs_dirs.push((dir_of(path), data.clone()));
                }
                _ => {}
            }
        }
        // Shallow directories first, so deeper attrs files are added later
        // and can override them.
        attrs_dirs.sort_by_key(|(d, _)| d.matches('/').count());
        // Build the attribute stack: global attrs, then per-tree files, then
        // info attrs last (highest precedence), mirroring git's lookup order.
        let mut attrs = git_lfs_git::AttrSet::empty();
        if !self.opts.global_attrs.is_empty() {
            attrs.add_buffer_at(&self.opts.global_attrs, "");
        }
        for (dir, content) in &attrs_dirs {
            attrs.add_buffer_at(content, dir);
        }
        if !self.opts.info_attrs.is_empty() {
            attrs.add_buffer_at(&self.opts.info_attrs, "");
        }
        // Pass 1: emit regular (non-attrs, non-symlink) blobs, converting
        // the ones whose path is LFS-tracked.
        //
        // NOTE(review): symlink blobs (mode "120000") referenced by mark are
        // skipped here AND in pass 2, so they appear never to be emitted in
        // fixup mode, unlike `process_commit` which has a dedicated symlink
        // pass — confirm fixup streams cannot carry marked symlink blobs.
        for change in &c.file_changes {
            if let FileChange::Modify {
                dataref: DataRef::Mark(m),
                path,
                mode,
            } = change
                && !is_attrs_path(path)
                && mode != "120000"
                && !self.emitted.contains(m)
                && let Some(content) = self.blob_buffer.remove(m)
            {
                if attrs.is_lfs_tracked(path) {
                    let (out, _) = self.import_blob(path, content)?;
                    writer.write(&Command::Blob(Blob {
                        mark: Some(*m),
                        original_oid: None,
                        data: out,
                    }))?;
                } else {
                    writer.write(&Command::Blob(Blob {
                        mark: Some(*m),
                        original_oid: None,
                        data: content,
                    }))?;
                }
                self.emitted.insert(*m);
            }
        }
        // Pass 2: emit the .gitattributes blobs themselves, verbatim.
        for change in &c.file_changes {
            if let FileChange::Modify {
                dataref: DataRef::Mark(m),
                path,
                ..
            } = change
                && is_attrs_path(path)
                && !self.emitted.contains(m)
                && let Some(content) = self.blob_buffer.remove(m)
            {
                writer.write(&Command::Blob(Blob {
                    mark: Some(*m),
                    original_oid: None,
                    data: content,
                }))?;
                self.emitted.insert(*m);
            }
        }
        writer.write(&Command::Commit(c))
    }

    /// Import/export mode: emits this commit's blobs (converting matches),
    /// then rewrites the root `.gitattributes` to carry accumulated patterns.
    fn process_commit<W: Write>(
        &mut self,
        mut c: Commit,
        writer: &mut Writer<W>,
    ) -> io::Result<()> {
        self.stats.commits_seen += 1;
        if let (Some(mark), Some(oid)) = (c.mark, c.original_oid.as_ref()) {
            self.stats.commit_marks.push((mark, oid.clone()));
        }
        // Pass 1: regular files (not root attrs, not symlinks).
        //
        // NOTE(review): only the root `.gitattributes` is excluded here
        // (`path != ATTRS_PATH`), whereas fixup mode excludes nested ones via
        // `is_attrs_path` — a `dir/.gitattributes` that matches the include
        // globs could be converted into a pointer. Confirm the asymmetry is
        // intended.
        for change in &c.file_changes {
            if let FileChange::Modify {
                dataref: DataRef::Mark(m),
                path,
                mode,
            } = change
                && path != ATTRS_PATH
                && mode != "120000"
                && !self.emitted.contains(m)
                && let Some(content) = self.blob_buffer.remove(m)
            {
                let (out, was_converted) = self.transform_blob(path, content)?;
                writer.write(&Command::Blob(Blob {
                    mark: Some(*m),
                    original_oid: None,
                    data: out,
                }))?;
                self.emitted.insert(*m);
                if was_converted {
                    self.add_pattern_for_path(path);
                    if self.opts.verbose
                        && let Some(oid) = c.original_oid.as_deref()
                    {
                        eprintln!(" commit {oid}: {path}");
                    }
                }
            }
        }
        // Pass 2: symlink targets are never converted; copy them through.
        for change in &c.file_changes {
            if let FileChange::Modify {
                dataref: DataRef::Mark(m),
                path: _,
                mode,
            } = change
                && mode == "120000"
                && !self.emitted.contains(m)
                && let Some(content) = self.blob_buffer.remove(m)
            {
                writer.write(&Command::Blob(Blob {
                    mark: Some(*m),
                    original_oid: None,
                    data: content,
                }))?;
                self.emitted.insert(*m);
            }
        }
        // Union the commit's existing root attrs with accumulated + initial
        // patterns, minus any removed lines.
        let existing_attrs = self.read_existing_attrs(&c);
        let new_attrs = build_attrs(
            &existing_attrs,
            &self.attrs_add,
            &self.attrs_add_initial,
            &self.attrs_remove,
        );
        let needs_attrs = !new_attrs.is_empty();
        if needs_attrs {
            // Emit the rewritten attrs under a fresh mark and repoint (or
            // insert) the commit's `.gitattributes` file change to it.
            let attrs_mark = self.alloc_fresh();
            writer.write(&Command::Blob(Blob {
                mark: Some(attrs_mark),
                original_oid: None,
                data: new_attrs.into_bytes(),
            }))?;
            replace_or_insert_attrs(&mut c.file_changes, attrs_mark);
        }
        // NOTE(review): if `new_attrs` is empty while the commit modifies
        // `.gitattributes` via a mark, that mark was never emitted yet the
        // commit still references it (the mark is skipped in both passes
        // above). Seems reachable only for an empty attrs file with nothing
        // to add — confirm.
        writer.write(&Command::Commit(c))
    }

    /// Applies the include/exclude globs, then dispatches on mode. Returns
    /// the (possibly rewritten) content plus whether a conversion happened.
    fn transform_blob(&mut self, path: &str, content: Vec<u8>) -> io::Result<(Vec<u8>, bool)> {
        if !path_matches(path, &self.opts.include, &self.opts.exclude) {
            return Ok((content, false));
        }
        match self.mode {
            Mode::Import => self.import_blob(path, content),
            Mode::Export => self.export_blob(path, content),
            // Fixup never reaches here via process_commit_fixup; keep as a
            // no-op for safety.
            Mode::Fixup => Ok((content, false)),
        }
    }

    /// Stores `content` in the LFS store and returns pointer text in its
    /// place. Already-pointer blobs and blobs below the size threshold are
    /// returned untouched with `false`.
    fn import_blob(&mut self, _path: &str, content: Vec<u8>) -> io::Result<(Vec<u8>, bool)> {
        let size = content.len() as u64;
        if Pointer::parse(&content).is_ok() {
            // Don't double-wrap an existing pointer.
            return Ok((content, false));
        }
        if size < self.opts.above {
            return Ok((content, false));
        }
        let oid_bytes: [u8; 32] = Sha256::digest(&content).into();
        let oid = Oid::from_bytes(oid_bytes);
        self.store
            .insert_verified(oid, &mut content.as_slice())
            .map_err(|e| io::Error::other(format!("storing object: {e}")))?;
        let pointer_text = Pointer::new(oid, size).encode().into_bytes();
        self.stats.blobs_converted += 1;
        self.stats.bytes_converted += size;
        Ok((pointer_text, true))
    }

    /// Replaces pointer text with the real content from the store. Non-pointer
    /// blobs, and pointers whose object is missing from the store, are
    /// returned untouched with `false`.
    fn export_blob(&mut self, _path: &str, content: Vec<u8>) -> io::Result<(Vec<u8>, bool)> {
        let pointer = match Pointer::parse(&content) {
            Ok(p) => p,
            Err(_) => return Ok((content, false)),
        };
        let mut file = match self.store.open(pointer.oid) {
            Ok(f) => f,
            Err(e) if e.kind() == io::ErrorKind::NotFound => {
                // Missing object: leave the pointer in place rather than fail.
                return Ok((content, false));
            }
            Err(e) => return Err(e),
        };
        let mut buf = Vec::with_capacity(pointer.size as usize);
        std::io::Read::read_to_end(&mut file, &mut buf)?;
        self.stats.blobs_converted += 1;
        self.stats.bytes_converted += pointer.size;
        Ok((buf, true))
    }

    /// Returns the content of the root `.gitattributes` modified by this
    /// commit (first match wins), or an empty string when absent. Non-UTF-8
    /// bytes are replaced lossily.
    fn read_existing_attrs(&self, c: &Commit) -> String {
        for ch in &c.file_changes {
            if let FileChange::Modify {
                dataref: DataRef::Mark(m),
                path,
                ..
            } = ch
                && path == ATTRS_PATH
                && let Some(bytes) = self.blob_buffer.get(m)
            {
                return String::from_utf8_lossy(bytes).into_owned();
            }
            if let FileChange::ModifyInline { path, data, .. } = ch
                && path == ATTRS_PATH
            {
                return String::from_utf8_lossy(data).into_owned();
            }
        }
        String::new()
    }

    /// Records an attribute pattern for a newly converted path (import mode
    /// only, and only unless suppressed by options).
    fn add_pattern_for_path(&mut self, path: &str) {
        if !matches!(self.mode, Mode::Import) {
            return;
        }
        if self.opts.skip_path_derived_attrs {
            return;
        }
        if self.opts.above > 0 {
            // A size threshold means not every file with this extension was
            // converted, so track the exact path rather than `*.ext`.
            let escaped = super::import::escape_attr_path(path);
            self.attrs_add
                .insert(format!("/{escaped} filter=lfs diff=lfs merge=lfs -text"));
            return;
        }
        // Otherwise derive `*.ext` from the file name's last extension;
        // dotfiles (idx == 0) and names ending in '.' yield no pattern.
        let leaf = path.rsplit('/').next().unwrap_or(path);
        let Some(idx) = leaf.rfind('.') else { return };
        if idx == 0 || idx >= leaf.len() - 1 {
            return;
        }
        let ext = &leaf[idx..];
        self.attrs_add
            .insert(format!("*{ext} filter=lfs diff=lfs merge=lfs -text"));
    }

    /// Allocates the next self-owned mark (see `FRESH_MARK_BASE`).
    fn alloc_fresh(&mut self) -> u32 {
        let m = self.next_fresh;
        self.next_fresh += 1;
        m
    }
}
fn is_attrs_path(path: &str) -> bool {
path == ATTRS_PATH
|| path
.rsplit_once('/')
.is_some_and(|(_, leaf)| leaf == ATTRS_PATH)
}
/// Returns the parent directory of `path` ("" for a top-level entry).
fn dir_of(path: &str) -> String {
    path.rsplit_once('/')
        .map_or_else(String::new, |(parent, _)| parent.to_owned())
}
fn path_matches(path: &str, include: &Option<GlobSet>, exclude: &Option<GlobSet>) -> bool {
if let Some(ex) = exclude
&& ex.is_match(path)
{
return false;
}
match include {
Some(inc) => inc.is_match(path),
None => true,
}
}
/// Produces the rewritten `.gitattributes` content: `existing` lines minus
/// `remove` entries (matched on the trimmed line), followed by `add` and then
/// `add_initial` patterns that are not already present. Every emitted line is
/// newline-terminated.
fn build_attrs(
    existing: &str,
    add: &BTreeSet<String>,
    add_initial: &[String],
    remove: &BTreeSet<String>,
) -> String {
    let mut seen: HashSet<String> = HashSet::new();
    let mut result = String::with_capacity(existing.len() + add.len() * 64);
    // Keep surviving existing lines verbatim, remembering their trimmed form
    // so additions below don't duplicate them.
    for line in existing.lines() {
        let key = line.trim();
        if remove.contains(key) {
            continue;
        }
        result.push_str(line);
        result.push('\n');
        seen.insert(key.to_owned());
    }
    // Append derived patterns first, then the caller-supplied initial ones.
    for pattern in add.iter().chain(add_initial) {
        if seen.insert(pattern.clone()) {
            result.push_str(pattern);
            result.push('\n');
        }
    }
    result
}
/// Points the commit's root `.gitattributes` file change at `attrs_mark`,
/// replacing the first existing change for that path (mark-based or inline),
/// or appending a new `M 100644` change when the commit has none.
fn replace_or_insert_attrs(changes: &mut Vec<FileChange>, attrs_mark: u32) {
    let replacement = || FileChange::Modify {
        mode: "100644".into(),
        dataref: DataRef::Mark(attrs_mark),
        path: ATTRS_PATH.into(),
    };
    for ch in changes.iter_mut() {
        // Only Modify/ModifyInline entries for the root attrs file count;
        // anything else (deletes, other paths) is left alone.
        let is_root_attrs = match ch {
            FileChange::Modify { path, .. } => path == ATTRS_PATH,
            FileChange::ModifyInline { path, .. } => path == ATTRS_PATH,
            _ => false,
        };
        if is_root_attrs {
            *ch = replacement();
            return;
        }
    }
    changes.push(replacement());
}
#[cfg(test)]
mod tests {
    use super::*;
    use globset::{Glob, GlobSetBuilder};
    use tempfile::TempDir;

    /// Creates a throwaway LFS store under a temp dir; the TempDir must stay
    /// alive for the store's lifetime.
    fn fixture_store() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let store = Store::new(tmp.path().join("lfs"));
        (tmp, store)
    }

    /// Builds a single-pattern GlobSet.
    fn glob(pat: &str) -> GlobSet {
        let mut b = GlobSetBuilder::new();
        b.add(Glob::new(pat).unwrap());
        b.build().unwrap()
    }

    /// Runs an import-mode transform over `input`, returning the output
    /// stream and stats.
    fn run_transform(input: &[u8], opts: Options) -> (Vec<u8>, Stats) {
        let (_tmp, store) = fixture_store();
        let mut out: Vec<u8> = Vec::new();
        let stats = Transform::new(&store, opts, Mode::Import)
            .run(input, &mut out)
            .unwrap();
        (out, stats)
    }

    /// Runs an export-mode transform over `input` against `store`.
    fn run_export(input: &[u8], opts: Options, store: &Store) -> (Vec<u8>, Stats) {
        let mut out: Vec<u8> = Vec::new();
        let stats = Transform::new(store, opts, Mode::Export)
            .run(input, &mut out)
            .unwrap();
        (out, stats)
    }

    // A blob whose path matches no include glob must pass through untouched.
    #[test]
    fn passes_through_streams_with_no_matching_blobs() {
        let input = b"blob\n\
mark :1\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
mark :2\n\
author A <a@b> 1 +0000\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 plain.txt\n\
\n";
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (_, stats) = run_transform(input, opts);
        assert_eq!(stats.blobs_converted, 0);
        assert_eq!(stats.commits_seen, 1);
        assert!(stats.patterns.is_empty());
    }

    // A matching blob becomes pointer text, the derived `*.bin` pattern is
    // recorded, and the commit gains a .gitattributes change.
    #[test]
    fn converts_matching_blob_to_pointer_and_accumulates_pattern() {
        let input = b"blob\n\
mark :1\n\
data 12\n\
hello world\n\
commit refs/heads/main\n\
mark :2\n\
author A <a@b> 1 +0000\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\
\n";
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (out, stats) = run_transform(input, opts);
        assert_eq!(stats.blobs_converted, 1);
        assert_eq!(stats.bytes_converted, 12);
        assert!(
            stats
                .patterns
                .contains("*.bin filter=lfs diff=lfs merge=lfs -text")
        );
        let s = String::from_utf8(out).expect("utf-8 stream");
        assert!(s.contains("oid sha256:"), "expected pointer text: {s}");
        assert!(
            s.contains("*.bin filter=lfs diff=lfs merge=lfs -text"),
            "expected attrs blob: {s}",
        );
        assert!(
            s.contains(".gitattributes"),
            "expected commit to gain a .gitattributes M: {s}",
        );
    }

    // Blobs below the `above` size threshold are not converted.
    #[test]
    fn respects_above_threshold() {
        let input = b"blob\n\
mark :1\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 a.bin\n\
\n";
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 100,
            ..Default::default()
        };
        let (_, stats) = run_transform(input, opts);
        assert_eq!(stats.blobs_converted, 0);
    }

    // A blob that already parses as a pointer must not be wrapped again.
    #[test]
    fn does_not_double_convert_existing_pointer_blob() {
        let oid = "30031a9831674dd684c3817399acebc88a116ce5a7a3fbc0cf34d92521a534e6";
        let pointer =
            format!("version https://git-lfs.github.com/spec/v1\noid sha256:{oid}\nsize 11\n");
        let blob_line = format!("data {}\n{pointer}", pointer.len());
        let input = format!(
            "blob\nmark :1\n{blob_line}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n"
        );
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (_, stats) = run_transform(input.as_bytes(), opts);
        assert_eq!(stats.blobs_converted, 0);
    }

    // Existing .gitattributes content survives and is unioned with the
    // newly derived pattern.
    #[test]
    fn rewrites_existing_gitattributes_with_union() {
        let input = b"blob\n\
mark :1\n\
data 16\n\
*.txt diff=text\n\
blob\n\
mark :2\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 .gitattributes\n\
M 100644 :2 a.bin\n\
\n";
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (out, _) = run_transform(input, opts);
        let s = String::from_utf8(out).unwrap();
        assert!(s.contains("*.txt diff=text"), "{s}");
        assert!(
            s.contains("*.bin filter=lfs diff=lfs merge=lfs -text"),
            "{s}",
        );
    }

    // build_attrs must not emit a pattern that already exists verbatim.
    #[test]
    fn build_attrs_unions_without_duplicating_existing_pattern() {
        let existing = "*.bin filter=lfs diff=lfs merge=lfs -text\n*.txt diff=text\n";
        let mut add = BTreeSet::new();
        add.insert("*.bin filter=lfs diff=lfs merge=lfs -text".to_string());
        add.insert("*.png filter=lfs diff=lfs merge=lfs -text".to_string());
        let remove = BTreeSet::new();
        let out = build_attrs(existing, &add, &[], &remove);
        let bin_count = out
            .lines()
            .filter(|l| *l == "*.bin filter=lfs diff=lfs merge=lfs -text")
            .count();
        assert_eq!(bin_count, 1, "should not duplicate existing pattern");
        assert!(out.contains("*.png filter=lfs"));
    }

    // Lines listed in `remove` are dropped; unrelated lines are preserved.
    #[test]
    fn build_attrs_drops_removed_patterns() {
        let existing = "*.bin filter=lfs diff=lfs merge=lfs -text\n*.txt diff=text\n";
        let add = BTreeSet::new();
        let mut remove = BTreeSet::new();
        remove.insert("*.bin filter=lfs diff=lfs merge=lfs -text".to_string());
        let out = build_attrs(existing, &add, &[], &remove);
        assert!(
            !out.contains("*.bin filter=lfs"),
            "removed line still present: {out}"
        );
        assert!(
            out.contains("*.txt diff=text"),
            "preserved line missing: {out}"
        );
    }

    // Export mode resolves a pointer through the store and emits the real
    // bytes plus the configured un-track attrs line.
    #[test]
    fn export_expands_pointer_blob_to_real_content() {
        let (_tmp, store) = fixture_store();
        let real = b"hello world\n";
        let (oid, _) = store.insert(&mut real.as_slice()).unwrap();
        let pointer = format!(
            "version https://git-lfs.github.com/spec/v1\n\
oid sha256:{oid}\n\
size {}\n",
            real.len(),
        );
        let input = format!(
            "blob\nmark :1\ndata {n}\n{pointer}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n",
            n = pointer.len(),
        );
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            attrs_add_initial: vec!["*.bin !text !filter !merge !diff".into()],
            ..Default::default()
        };
        let (out, stats) = run_export(input.as_bytes(), opts, &store);
        assert_eq!(stats.blobs_converted, 1);
        let s = String::from_utf8_lossy(&out);
        assert!(
            s.contains("\nhello world\n"),
            "expected raw content in stream: {s}"
        );
        assert!(
            !s.contains("oid sha256:"),
            "pointer text should be gone: {s}"
        );
        assert!(
            s.contains("*.bin !text !filter !merge !diff"),
            "expected un-track line: {s}",
        );
    }

    // Non-pointer blobs are untouched by export mode.
    #[test]
    fn export_passes_through_non_pointer_blobs() {
        let (_tmp, store) = fixture_store();
        let input = b"blob\nmark :1\ndata 5\nhello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 plain.txt\n\n";
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (_, stats) = run_export(input, opts, &store);
        assert_eq!(stats.blobs_converted, 0);
    }

    // A pointer whose object is absent from the store is left as a pointer
    // rather than causing a failure.
    #[test]
    fn export_leaves_pointer_alone_when_object_missing_from_store() {
        let (_tmp, store) = fixture_store();
        let oid = "1111111111111111111111111111111111111111111111111111111111111111";
        let pointer = format!(
            "version https://git-lfs.github.com/spec/v1\n\
oid sha256:{oid}\nsize 5\n",
        );
        let input = format!(
            "blob\nmark :1\ndata {n}\n{pointer}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n",
            n = pointer.len(),
        );
        let opts = Options {
            include: Some(glob("*.bin")),
            exclude: None,
            above: 0,
            ..Default::default()
        };
        let (_, stats) = run_export(input.as_bytes(), opts, &store);
        assert_eq!(stats.blobs_converted, 0);
    }

    // With no existing attrs change, a new `.gitattributes` M is appended.
    #[test]
    fn replace_or_insert_attrs_inserts_when_missing() {
        let mut changes = vec![FileChange::Modify {
            mode: "100644".into(),
            dataref: DataRef::Mark(7),
            path: "data.bin".into(),
        }];
        replace_or_insert_attrs(&mut changes, 99);
        assert_eq!(changes.len(), 2);
        match &changes[1] {
            FileChange::Modify { path, dataref, .. } => {
                assert_eq!(path, ".gitattributes");
                assert_eq!(dataref, &DataRef::Mark(99));
            }
            other => panic!("got {other:?}"),
        }
    }

    // An existing attrs change has its dataref redirected to the fresh mark.
    #[test]
    fn replace_or_insert_attrs_updates_existing_dataref() {
        let mut changes = vec![FileChange::Modify {
            mode: "100644".into(),
            dataref: DataRef::Mark(42),
            path: ".gitattributes".into(),
        }];
        replace_or_insert_attrs(&mut changes, 99);
        assert_eq!(changes.len(), 1);
        match &changes[0] {
            FileChange::Modify { dataref, .. } => {
                assert_eq!(dataref, &DataRef::Mark(99));
            }
            other => panic!("got {other:?}"),
        }
    }
}