use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::{self, Read, Write};
use git_lfs_pointer::{Oid, Pointer};
use git_lfs_store::Store;
use globset::GlobSet;
use sha2::{Digest, Sha256};
use super::fast_export::{Blob, Command, Commit, DataRef, FileChange, Reader};
use super::fast_import::Writer;
/// Repository-relative path of the attributes file this transform rewrites.
const ATTRS_PATH: &str = ".gitattributes";
/// Base for marks allocated by the transform itself (used for injected
/// `.gitattributes` blobs). NOTE(review): assumes input streams only use
/// marks below 2^30 — a collision with an input mark at or above this base
/// would corrupt the output; confirm upstream exporters stay below it.
const FRESH_MARK_BASE: u32 = 1 << 30;
/// Direction of the rewrite.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
/// Replace matching large blobs with LFS pointer text, storing the real
/// content in the object store.
Import,
/// Replace LFS pointer text with the real content read from the store.
Export,
}
/// Path/size filters controlling which blobs are converted.
#[derive(Debug, Clone)]
pub struct Options {
/// When set, only paths matching this set are converted.
pub include: Option<GlobSet>,
/// Paths matching this set are never converted; checked before `include`.
pub exclude: Option<GlobSet>,
/// Minimum blob size in bytes for Import conversion; smaller blobs pass
/// through unchanged.
pub above: u64,
}
/// Counters accumulated over one transform run.
#[derive(Debug, Default)]
pub struct Stats {
// Number of blobs converted (to pointers in Import, to content in Export).
pub blobs_converted: u64,
// Total payload bytes of the converted blobs.
pub bytes_converted: u64,
// Number of commit commands seen in the stream.
pub commits_seen: u64,
// Attribute lines added to `.gitattributes` during the run.
pub patterns: BTreeSet<String>,
}
/// Streaming rewriter: parses a `git fast-export` stream, converts blobs
/// to/from LFS pointers, and re-emits a `git fast-import` stream.
pub struct Transform<'a> {
// LFS object store: written to in Import mode, read from in Export mode.
store: &'a Store,
opts: Options,
mode: Mode,
// Marked blobs buffered until a commit references them and we know the path.
blob_buffer: HashMap<u32, Vec<u8>>,
// Marks whose blob has already been written to the output stream.
emitted: HashSet<u32>,
// Next mark to hand out for blobs injected by the transform itself.
next_fresh: u32,
// Attribute lines to add to / remove from `.gitattributes`.
attrs_add: BTreeSet<String>,
attrs_remove: BTreeSet<String>,
pub stats: Stats,
}
impl<'a> Transform<'a> {
pub fn new(store: &'a Store, opts: Options, mode: Mode) -> Self {
Self {
store,
opts,
mode,
blob_buffer: HashMap::new(),
emitted: HashSet::new(),
next_fresh: FRESH_MARK_BASE,
attrs_add: BTreeSet::new(),
attrs_remove: BTreeSet::new(),
stats: Stats::default(),
}
}
pub fn run<R: Read, W: Write>(
mut self,
r: R,
w: W,
) -> io::Result<Stats> {
let mut reader = Reader::new(r);
let mut writer = Writer::new(w);
while let Some(cmd) = reader.next()? {
self.process(cmd, &mut writer)?;
}
writer.flush()?;
self.stats.patterns = self.attrs_add.clone();
Ok(self.stats)
}
fn process<W: Write>(
&mut self,
cmd: Command,
writer: &mut Writer<W>,
) -> io::Result<()> {
match cmd {
Command::Blob(b) => {
if let Some(mark) = b.mark {
self.blob_buffer.insert(mark, b.data);
} else {
writer.write(&Command::Blob(b))?;
}
Ok(())
}
Command::Commit(c) => self.process_commit(c, writer),
other => writer.write(&other),
}
}
fn process_commit<W: Write>(
&mut self,
mut c: Commit,
writer: &mut Writer<W>,
) -> io::Result<()> {
self.stats.commits_seen += 1;
for change in &c.file_changes {
if let FileChange::Modify {
dataref: DataRef::Mark(m),
path,
..
} = change
&& path != ATTRS_PATH
&& !self.emitted.contains(m)
&& let Some(content) = self.blob_buffer.remove(m)
{
let (out, was_converted) = self.transform_blob(path, content)?;
writer.write(&Command::Blob(Blob {
mark: Some(*m),
original_oid: None,
data: out,
}))?;
self.emitted.insert(*m);
if was_converted {
self.add_pattern_for_path(path);
}
}
}
let existing_attrs = self.read_existing_attrs(&c);
let new_attrs = build_attrs(&existing_attrs, &self.attrs_add, &self.attrs_remove);
let needs_attrs = !new_attrs.is_empty();
if needs_attrs {
let attrs_mark = self.alloc_fresh();
writer.write(&Command::Blob(Blob {
mark: Some(attrs_mark),
original_oid: None,
data: new_attrs.into_bytes(),
}))?;
replace_or_insert_attrs(&mut c.file_changes, attrs_mark);
}
writer.write(&Command::Commit(c))
}
fn transform_blob(
&mut self,
path: &str,
content: Vec<u8>,
) -> io::Result<(Vec<u8>, bool)> {
if !path_matches(path, &self.opts.include, &self.opts.exclude) {
return Ok((content, false));
}
match self.mode {
Mode::Import => self.import_blob(path, content),
Mode::Export => self.export_blob(path, content),
}
}
fn import_blob(&mut self, _path: &str, content: Vec<u8>) -> io::Result<(Vec<u8>, bool)> {
let size = content.len() as u64;
if Pointer::parse(&content).is_ok() {
return Ok((content, false));
}
if size < self.opts.above {
return Ok((content, false));
}
let oid_bytes: [u8; 32] = Sha256::digest(&content).into();
let oid = Oid::from_bytes(oid_bytes);
self.store
.insert_verified(oid, &mut content.as_slice())
.map_err(|e| io::Error::other(format!("storing object: {e}")))?;
let pointer_text = Pointer::new(oid, size).encode().into_bytes();
self.stats.blobs_converted += 1;
self.stats.bytes_converted += size;
Ok((pointer_text, true))
}
fn export_blob(&mut self, _path: &str, content: Vec<u8>) -> io::Result<(Vec<u8>, bool)> {
let pointer = match Pointer::parse(&content) {
Ok(p) => p,
Err(_) => return Ok((content, false)),
};
let mut file = match self.store.open(pointer.oid) {
Ok(f) => f,
Err(e) if e.kind() == io::ErrorKind::NotFound => {
return Ok((content, false));
}
Err(e) => return Err(e),
};
let mut buf = Vec::with_capacity(pointer.size as usize);
std::io::Read::read_to_end(&mut file, &mut buf)?;
self.stats.blobs_converted += 1;
self.stats.bytes_converted += pointer.size;
Ok((buf, true))
}
fn read_existing_attrs(&self, c: &Commit) -> String {
for ch in &c.file_changes {
if let FileChange::Modify {
dataref: DataRef::Mark(m),
path,
..
} = ch
&& path == ATTRS_PATH
&& let Some(bytes) = self.blob_buffer.get(m)
{
return String::from_utf8_lossy(bytes).into_owned();
}
if let FileChange::ModifyInline { path, data, .. } = ch
&& path == ATTRS_PATH
{
return String::from_utf8_lossy(data).into_owned();
}
}
String::new()
}
fn add_pattern_for_path(&mut self, path: &str) {
let leaf = path.rsplit('/').next().unwrap_or(path);
let Some(idx) = leaf.rfind('.') else { return };
if idx == 0 || idx >= leaf.len() - 1 {
return;
}
let ext = &leaf[idx..];
match self.mode {
Mode::Import => {
self.attrs_add.insert(format!(
"*{ext} filter=lfs diff=lfs merge=lfs -text"
));
}
Mode::Export => {
self.attrs_remove.insert(format!(
"*{ext} filter=lfs diff=lfs merge=lfs -text"
));
self.attrs_add
.insert(format!("*{ext} !text !filter !merge !diff"));
}
}
}
fn alloc_fresh(&mut self) -> u32 {
let m = self.next_fresh;
self.next_fresh += 1;
m
}
}
fn path_matches(
path: &str,
include: &Option<GlobSet>,
exclude: &Option<GlobSet>,
) -> bool {
if let Some(ex) = exclude
&& ex.is_match(path)
{
return false;
}
match include {
Some(inc) => inc.is_match(path),
None => true,
}
}
/// Merges existing `.gitattributes` content with the `add` patterns, dropping
/// any line whose trimmed form appears in `remove`. Surviving existing lines
/// are kept verbatim and first; added patterns are appended unless an
/// equivalent trimmed line is already present. All lines end with `\n`.
fn build_attrs(
    existing: &str,
    add: &BTreeSet<String>,
    remove: &BTreeSet<String>,
) -> String {
    let kept: Vec<&str> = existing
        .lines()
        .filter(|line| !remove.contains(line.trim()))
        .collect();
    let mut seen: HashSet<&str> = HashSet::new();
    let mut result = String::with_capacity(existing.len() + add.len() * 64);
    for line in &kept {
        result.push_str(line);
        result.push('\n');
        seen.insert(line.trim());
    }
    // `add` is a set, so its own entries are already unique; only existing
    // lines can duplicate them.
    for pattern in add {
        if !seen.contains(pattern.as_str()) {
            result.push_str(pattern);
            result.push('\n');
        }
    }
    result
}
/// Points the commit's `.gitattributes` entry at `attrs_mark`, converting an
/// inline modification into a mark-based one if needed. When the commit does
/// not touch `.gitattributes` at all, a new entry is appended.
fn replace_or_insert_attrs(changes: &mut Vec<FileChange>, attrs_mark: u32) {
    let fresh_entry = || FileChange::Modify {
        mode: "100644".into(),
        dataref: DataRef::Mark(attrs_mark),
        path: ATTRS_PATH.into(),
    };
    for change in changes.iter_mut() {
        match change {
            FileChange::Modify { path, dataref, .. } if path == ATTRS_PATH => {
                // Re-point the existing mark-based entry.
                *dataref = DataRef::Mark(attrs_mark);
                return;
            }
            FileChange::ModifyInline { path, .. } if path == ATTRS_PATH => {
                // Inline data is superseded by the rebuilt attrs blob.
                *change = fresh_entry();
                return;
            }
            _ => {}
        }
    }
    changes.push(fresh_entry());
}
#[cfg(test)]
mod tests {
use super::*;
use globset::{Glob, GlobSetBuilder};
use tempfile::TempDir;
// Creates a throwaway on-disk store; the caller must keep the TempDir alive.
fn fixture_store() -> (TempDir, Store) {
let tmp = TempDir::new().unwrap();
let store = Store::new(tmp.path().join("lfs"));
(tmp, store)
}
// Builds a one-pattern GlobSet for include/exclude options.
fn glob(pat: &str) -> GlobSet {
let mut b = GlobSetBuilder::new();
b.add(Glob::new(pat).unwrap());
b.build().unwrap()
}
// Runs an Import-mode transform over `input` against a fresh store.
fn run_transform(input: &[u8], opts: Options) -> (Vec<u8>, Stats) {
let (_tmp, store) = fixture_store();
let mut out: Vec<u8> = Vec::new();
let stats = Transform::new(&store, opts, Mode::Import)
.run(input, &mut out)
.unwrap();
(out, stats)
}
// Runs an Export-mode transform against a caller-provided store.
fn run_export(input: &[u8], opts: Options, store: &Store) -> (Vec<u8>, Stats) {
let mut out: Vec<u8> = Vec::new();
let stats = Transform::new(store, opts, Mode::Export)
.run(input, &mut out)
.unwrap();
(out, stats)
}
// A blob whose path doesn't match the include glob must not be converted.
#[test]
fn passes_through_streams_with_no_matching_blobs() {
let input = b"blob\n\
mark :1\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
mark :2\n\
author A <a@b> 1 +0000\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 plain.txt\n\
\n";
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (_, stats) = run_transform(input, opts);
assert_eq!(stats.blobs_converted, 0);
assert_eq!(stats.commits_seen, 1);
assert!(stats.patterns.is_empty());
}
// Import happy path: blob becomes a pointer and a tracking pattern appears
// in both the stats and the injected .gitattributes blob.
#[test]
fn converts_matching_blob_to_pointer_and_accumulates_pattern() {
let input = b"blob\n\
mark :1\n\
data 12\n\
hello world\n\
commit refs/heads/main\n\
mark :2\n\
author A <a@b> 1 +0000\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\
\n";
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (out, stats) = run_transform(input, opts);
assert_eq!(stats.blobs_converted, 1);
assert_eq!(stats.bytes_converted, 12);
assert!(stats
.patterns
.contains("*.bin filter=lfs diff=lfs merge=lfs -text"));
let s = String::from_utf8(out).expect("utf-8 stream");
assert!(s.contains("oid sha256:"), "expected pointer text: {s}");
assert!(
s.contains("*.bin filter=lfs diff=lfs merge=lfs -text"),
"expected attrs blob: {s}",
);
assert!(
s.contains(".gitattributes"),
"expected commit to gain a .gitattributes M: {s}",
);
}
// Blobs smaller than `above` stay untouched even when the path matches.
#[test]
fn respects_above_threshold() {
let input = b"blob\n\
mark :1\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 a.bin\n\
\n";
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 100,
};
let (_, stats) = run_transform(input, opts);
assert_eq!(stats.blobs_converted, 0);
}
// Content that already parses as a pointer must not be re-wrapped.
#[test]
fn does_not_double_convert_existing_pointer_blob() {
let oid = "30031a9831674dd684c3817399acebc88a116ce5a7a3fbc0cf34d92521a534e6";
let pointer = format!(
"version https://git-lfs.github.com/spec/v1\noid sha256:{oid}\nsize 11\n"
);
let blob_line = format!("data {}\n{pointer}", pointer.len());
let input = format!(
"blob\nmark :1\n{blob_line}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n"
);
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (_, stats) = run_transform(input.as_bytes(), opts);
assert_eq!(stats.blobs_converted, 0);
}
// A pre-existing .gitattributes keeps its own lines and gains the new pattern.
#[test]
fn rewrites_existing_gitattributes_with_union() {
let input = b"blob\n\
mark :1\n\
data 16\n\
*.txt diff=text\n\
blob\n\
mark :2\n\
data 5\n\
hello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 .gitattributes\n\
M 100644 :2 a.bin\n\
\n";
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (out, _) = run_transform(input, opts);
let s = String::from_utf8(out).unwrap();
assert!(s.contains("*.txt diff=text"), "{s}");
assert!(
s.contains("*.bin filter=lfs diff=lfs merge=lfs -text"),
"{s}",
);
}
// build_attrs: a pattern already present in `existing` is not duplicated.
#[test]
fn build_attrs_unions_without_duplicating_existing_pattern() {
let existing = "*.bin filter=lfs diff=lfs merge=lfs -text\n*.txt diff=text\n";
let mut add = BTreeSet::new();
add.insert("*.bin filter=lfs diff=lfs merge=lfs -text".to_string());
add.insert("*.png filter=lfs diff=lfs merge=lfs -text".to_string());
let remove = BTreeSet::new();
let out = build_attrs(existing, &add, &remove);
let bin_count = out
.lines()
.filter(|l| *l == "*.bin filter=lfs diff=lfs merge=lfs -text")
.count();
assert_eq!(bin_count, 1, "should not duplicate existing pattern");
assert!(out.contains("*.png filter=lfs"));
}
// build_attrs: lines listed in `remove` are dropped, others preserved.
#[test]
fn build_attrs_drops_removed_patterns() {
let existing = "*.bin filter=lfs diff=lfs merge=lfs -text\n*.txt diff=text\n";
let add = BTreeSet::new();
let mut remove = BTreeSet::new();
remove.insert("*.bin filter=lfs diff=lfs merge=lfs -text".to_string());
let out = build_attrs(existing, &add, &remove);
assert!(!out.contains("*.bin filter=lfs"), "removed line still present: {out}");
assert!(out.contains("*.txt diff=text"), "preserved line missing: {out}");
}
// Export happy path: pointer is replaced by stored content and an
// attribute-reset line is emitted.
#[test]
fn export_expands_pointer_blob_to_real_content() {
let (_tmp, store) = fixture_store();
let real = b"hello world\n";
let (oid, _) = store.insert(&mut real.as_slice()).unwrap();
let pointer = format!(
"version https://git-lfs.github.com/spec/v1\n\
oid sha256:{oid}\n\
size {}\n",
real.len(),
);
let input = format!(
"blob\nmark :1\ndata {n}\n{pointer}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n",
n = pointer.len(),
);
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (out, stats) = run_export(input.as_bytes(), opts, &store);
assert_eq!(stats.blobs_converted, 1);
let s = String::from_utf8_lossy(&out);
assert!(s.contains("\nhello world\n"), "expected raw content in stream: {s}");
assert!(!s.contains("oid sha256:"), "pointer text should be gone: {s}");
assert!(
s.contains("*.bin !text !filter !merge !diff"),
"expected un-track line: {s}",
);
}
// Export: ordinary (non-pointer) content passes through untouched.
#[test]
fn export_passes_through_non_pointer_blobs() {
let (_tmp, store) = fixture_store();
let input = b"blob\nmark :1\ndata 5\nhello\n\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 plain.txt\n\n";
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (_, stats) = run_export(input, opts, &store);
assert_eq!(stats.blobs_converted, 0);
}
// Export: a pointer whose object is absent from the store is left as-is
// rather than failing the whole run.
#[test]
fn export_leaves_pointer_alone_when_object_missing_from_store() {
let (_tmp, store) = fixture_store();
let oid = "1111111111111111111111111111111111111111111111111111111111111111";
let pointer = format!(
"version https://git-lfs.github.com/spec/v1\n\
oid sha256:{oid}\nsize 5\n",
);
let input = format!(
"blob\nmark :1\ndata {n}\n{pointer}\
commit refs/heads/main\n\
committer A <a@b> 1 +0000\n\
data 1\nm\n\
M 100644 :1 data.bin\n\n",
n = pointer.len(),
);
let opts = Options {
include: Some(glob("*.bin")),
exclude: None,
above: 0,
};
let (_, stats) = run_export(input.as_bytes(), opts, &store);
assert_eq!(stats.blobs_converted, 0);
}
// replace_or_insert_attrs: appends a new .gitattributes entry when missing.
#[test]
fn replace_or_insert_attrs_inserts_when_missing() {
let mut changes = vec![FileChange::Modify {
mode: "100644".into(),
dataref: DataRef::Mark(7),
path: "data.bin".into(),
}];
replace_or_insert_attrs(&mut changes, 99);
assert_eq!(changes.len(), 2);
match &changes[1] {
FileChange::Modify { path, dataref, .. } => {
assert_eq!(path, ".gitattributes");
assert_eq!(dataref, &DataRef::Mark(99));
}
other => panic!("got {other:?}"),
}
}
// replace_or_insert_attrs: re-points an existing entry instead of appending.
#[test]
fn replace_or_insert_attrs_updates_existing_dataref() {
let mut changes = vec![FileChange::Modify {
mode: "100644".into(),
dataref: DataRef::Mark(42),
path: ".gitattributes".into(),
}];
replace_or_insert_attrs(&mut changes, 99);
assert_eq!(changes.len(), 1);
match &changes[0] {
FileChange::Modify { dataref, .. } => {
assert_eq!(dataref, &DataRef::Mark(99));
}
other => panic!("got {other:?}"),
}
}
}