use std::collections::BTreeMap;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use crate::manifest::ChecksumAlgo;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[non_exhaustive]
pub enum DedupeStrategy {
#[default]
None,
Blanks,
All {
algo: ChecksumAlgo,
},
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum DedupeDecision {
WriteNew {
shared_key: String,
shared_path: PathBuf,
},
Reference {
shared_key: String,
shared_path: PathBuf,
},
}
type AlgoTag = u8;
const ALGO_TAG_BLAKE3: AlgoTag = 1;
const ALGO_TAG_SHA256: AlgoTag = 2;
fn algo_tag(a: ChecksumAlgo) -> AlgoTag {
match a {
ChecksumAlgo::Blake3 => ALGO_TAG_BLAKE3,
ChecksumAlgo::Sha256 => ALGO_TAG_SHA256,
}
}
#[derive(Debug, Default)]
struct DedupeState {
seen: BTreeMap<(AlgoTag, [u8; 32]), String>,
refs: BTreeMap<String, String>,
}
#[non_exhaustive]
pub struct DedupeIndex {
strategy: DedupeStrategy,
state: Mutex<DedupeState>,
}
impl DedupeIndex {
pub fn new(strategy: DedupeStrategy) -> Self {
Self {
strategy,
state: Mutex::new(DedupeState::default()),
}
}
pub fn strategy(&self) -> DedupeStrategy {
self.strategy
}
fn effective_algo(&self) -> ChecksumAlgo {
match self.strategy {
DedupeStrategy::None | DedupeStrategy::Blanks => ChecksumAlgo::Blake3,
DedupeStrategy::All { algo } => algo,
}
}
fn hash_content_raw(&self, bytes: &[u8]) -> (ChecksumAlgo, [u8; 32]) {
let algo = self.effective_algo();
let digest = match algo {
ChecksumAlgo::Blake3 => {
let h = blake3::hash(bytes);
*h.as_bytes()
}
ChecksumAlgo::Sha256 => {
use sha2::Digest;
let mut hasher = sha2::Sha256::new();
hasher.update(bytes);
let out = hasher.finalize();
let mut buf = [0u8; 32];
buf.copy_from_slice(&out);
buf
}
};
(algo, digest)
}
pub fn record(&self, path: &str, bytes: &[u8]) -> DedupeDecision {
let ext = Path::new(path)
.extension()
.and_then(|e| e.to_str())
.unwrap_or("bin");
let (algo, digest) = self.hash_content_raw(bytes);
let hash_hex = hex_lower(&digest);
let shared_key = format!("blank_{hash_hex}");
let shared_path = Self::shared_path(&shared_key, ext);
let mut state = self.state.lock().expect("dedupe state mutex poisoned");
state.refs.insert(path.to_string(), shared_key.clone());
match state.seen.entry((algo_tag(algo), digest)) {
std::collections::btree_map::Entry::Vacant(e) => {
e.insert(shared_key.clone());
DedupeDecision::WriteNew {
shared_key,
shared_path,
}
}
std::collections::btree_map::Entry::Occupied(_) => DedupeDecision::Reference {
shared_key,
shared_path,
},
}
}
pub fn forget_reference(&self, path: &str) {
let mut state = self.state.lock().expect("dedupe state mutex poisoned");
state.refs.remove(path);
}
pub fn seed_shared_key(&self, hash_hex: String, shared_key: String) {
let digest = match decode_hex_32(&hash_hex) {
Some(d) => d,
None => return,
};
let algo = self.effective_algo();
let mut state = self.state.lock().expect("dedupe state mutex poisoned");
state.seen.insert((algo_tag(algo), digest), shared_key);
}
pub fn shared_path(shared_key: &str, ext: &str) -> PathBuf {
let mut p = PathBuf::from("_shared");
p.push(format!("{shared_key}.{ext}"));
p
}
pub fn references(&self) -> BTreeMap<String, String> {
self.state
.lock()
.expect("dedupe state mutex poisoned")
.refs
.clone()
}
pub fn distinct_count(&self) -> usize {
self.state
.lock()
.expect("dedupe state mutex poisoned")
.seen
.len()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum LinkResult {
Symlink,
Hardlink,
ManifestOnly,
}
const PLACEHOLDER_MARKER: [u8; 1] = [0u8];
#[cfg(unix)]
fn try_symlink(target: &Path, link: &Path) -> io::Result<()> {
let target_for_link = symlink_target_for(target, link);
std::os::unix::fs::symlink(&target_for_link, link)
}
#[cfg(windows)]
fn try_symlink(target: &Path, link: &Path) -> io::Result<()> {
std::os::windows::fs::symlink_file(target, link)
}
#[cfg(not(any(unix, windows)))]
fn try_symlink(_target: &Path, _link: &Path) -> io::Result<()> {
Err(io::Error::new(
io::ErrorKind::Unsupported,
"symlinks are not supported on this platform",
))
}
#[cfg(any(unix, windows))]
fn try_hardlink(target: &Path, link: &Path) -> io::Result<()> {
std::fs::hard_link(target, link)
}
#[cfg(not(any(unix, windows)))]
fn try_hardlink(target: &Path, link: &Path) -> io::Result<()> {
std::fs::hard_link(target, link)
}
#[cfg(unix)]
fn symlink_target_for(target: &Path, link: &Path) -> PathBuf {
let (Some(link_parent), true) = (link.parent(), target.is_absolute()) else {
return target.to_path_buf();
};
if !link_parent.is_absolute() {
return target.to_path_buf();
}
match pathdiff(target, link_parent) {
Some(rel) => rel,
None => target.to_path_buf(),
}
}
#[cfg(unix)]
fn pathdiff(to: &Path, from: &Path) -> Option<PathBuf> {
use std::path::Component;
let to_components: Vec<Component<'_>> = to.components().collect();
let from_components: Vec<Component<'_>> = from.components().collect();
let mut i = 0;
while i < to_components.len() && i < from_components.len() {
if to_components[i] != from_components[i] {
break;
}
i += 1;
}
if i == 0 {
return None;
}
let mut rel = PathBuf::new();
for _ in i..from_components.len() {
rel.push("..");
}
for comp in &to_components[i..] {
rel.push(comp.as_os_str());
}
if rel.as_os_str().is_empty() {
rel.push(".");
}
Some(rel)
}
pub fn materialize_reference(tile_path: &Path, shared_path: &Path) -> LinkResult {
if tile_path.exists() || tile_path.is_symlink() {
let _ = std::fs::remove_file(tile_path);
}
#[cfg(windows)]
{
if try_hardlink(shared_path, tile_path).is_ok() {
return LinkResult::Hardlink;
}
if try_symlink(shared_path, tile_path).is_ok() {
return LinkResult::Symlink;
}
}
#[cfg(not(windows))]
{
if try_symlink(shared_path, tile_path).is_ok() {
return LinkResult::Symlink;
}
if try_hardlink(shared_path, tile_path).is_ok() {
return LinkResult::Hardlink;
}
}
match std::fs::write(tile_path, PLACEHOLDER_MARKER) {
Ok(()) => LinkResult::ManifestOnly,
Err(_) => {
LinkResult::ManifestOnly
}
}
}
fn hex_lower(bytes: &[u8]) -> String {
const HEX: &[u8; 16] = b"0123456789abcdef";
let mut s = String::with_capacity(bytes.len() * 2);
for &b in bytes {
s.push(HEX[(b >> 4) as usize] as char);
s.push(HEX[(b & 0x0f) as usize] as char);
}
s
}
fn decode_hex_32(s: &str) -> Option<[u8; 32]> {
let bytes = s.as_bytes();
if bytes.len() != 64 {
return None;
}
let mut out = [0u8; 32];
for (i, chunk) in bytes.chunks_exact(2).enumerate() {
let hi = hex_nibble(chunk[0])?;
let lo = hex_nibble(chunk[1])?;
out[i] = (hi << 4) | lo;
}
Some(out)
}
fn hex_nibble(b: u8) -> Option<u8> {
match b {
b'0'..=b'9' => Some(b - b'0'),
b'a'..=b'f' => Some(10 + (b - b'a')),
b'A'..=b'F' => Some(10 + (b - b'A')),
_ => None,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn default_strategy_is_none() {
assert_eq!(DedupeStrategy::default(), DedupeStrategy::None);
}
#[test]
fn shared_path_is_under_shared_dir() {
let p = DedupeIndex::shared_path("blank_deadbeef", "png");
assert_eq!(p, PathBuf::from("_shared").join("blank_deadbeef.png"));
}
#[test]
fn record_first_returns_write_new_second_returns_reference() {
let idx = DedupeIndex::new(DedupeStrategy::Blanks);
let bytes = b"hello world";
let d1 = idx.record("0/0_0.png", bytes);
let d2 = idx.record("0/0_1.png", bytes);
match d1 {
DedupeDecision::WriteNew { shared_key, .. } => {
assert!(shared_key.starts_with("blank_"));
}
_ => panic!("first record must be WriteNew"),
}
match d2 {
DedupeDecision::Reference { shared_key, .. } => {
assert!(shared_key.starts_with("blank_"));
}
_ => panic!("second record must be Reference"),
}
}
#[test]
fn distinct_contents_produce_distinct_shared_keys() {
let idx = DedupeIndex::new(DedupeStrategy::Blanks);
let d1 = idx.record("a.png", b"aaaa");
let d2 = idx.record("b.png", b"bbbb");
let k1 = match d1 {
DedupeDecision::WriteNew { shared_key, .. } => shared_key,
_ => unreachable!(),
};
let k2 = match d2 {
DedupeDecision::WriteNew { shared_key, .. } => shared_key,
_ => unreachable!(),
};
assert_ne!(k1, k2);
}
#[test]
fn references_map_captures_all_recorded_paths() {
let idx = DedupeIndex::new(DedupeStrategy::Blanks);
idx.record("a.png", b"xxxx");
idx.record("b.png", b"xxxx");
idx.record("c.png", b"yyyy");
let refs = idx.references();
assert_eq!(refs.len(), 3);
assert_eq!(refs["a.png"], refs["b.png"]);
assert_ne!(refs["a.png"], refs["c.png"]);
}
#[test]
fn forget_reference_removes_manifest_entry() {
let idx = DedupeIndex::new(DedupeStrategy::Blanks);
idx.record("a.png", b"xxxx");
idx.forget_reference("a.png");
assert!(idx.references().is_empty());
}
#[test]
fn all_strategy_honours_custom_algo() {
let idx_blake = DedupeIndex::new(DedupeStrategy::All {
algo: ChecksumAlgo::Blake3,
});
let idx_sha = DedupeIndex::new(DedupeStrategy::All {
algo: ChecksumAlgo::Sha256,
});
let key_blake = match idx_blake.record("x.png", b"content") {
DedupeDecision::WriteNew { shared_key, .. } => shared_key,
_ => unreachable!(),
};
let key_sha = match idx_sha.record("x.png", b"content") {
DedupeDecision::WriteNew { shared_key, .. } => shared_key,
_ => unreachable!(),
};
assert_ne!(
key_blake, key_sha,
"different algos must produce different shared keys"
);
}
#[test]
#[cfg_attr(miri, ignore)] fn materialize_reference_creates_a_readable_tile() {
let tmp = tempfile::tempdir().unwrap();
let shared_dir = tmp.path().join("_shared");
std::fs::create_dir_all(&shared_dir).unwrap();
let shared = shared_dir.join("blank_abc.png");
std::fs::write(&shared, b"SHAREDCONTENT").unwrap();
let tile = tmp.path().join("0").join("0_0.png");
std::fs::create_dir_all(tile.parent().unwrap()).unwrap();
let result = materialize_reference(&tile, &shared);
assert!(matches!(
result,
LinkResult::Symlink | LinkResult::Hardlink | LinkResult::ManifestOnly
));
assert!(tile.exists() || tile.is_symlink());
if matches!(result, LinkResult::Symlink | LinkResult::Hardlink) {
let read_back = std::fs::read(&tile).unwrap();
assert_eq!(read_back, b"SHAREDCONTENT");
} else {
let md = std::fs::metadata(&tile).unwrap();
assert_eq!(md.len(), 1);
}
}
#[test]
fn seed_shared_key_suppresses_later_write_new() {
let idx = DedupeIndex::new(DedupeStrategy::Blanks);
let hash = {
let h = blake3::hash(b"payload");
hex_lower(h.as_bytes())
};
idx.seed_shared_key(hash.clone(), format!("blank_{hash}"));
let decision = idx.record("a.png", b"payload");
assert!(matches!(decision, DedupeDecision::Reference { .. }));
}
#[test]
fn decode_hex_32_roundtrip() {
let bytes = [
0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd,
0xee, 0xff, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe, 0x01, 0x02, 0x03, 0x04,
0x05, 0x06, 0x07, 0x08,
];
let hex = hex_lower(&bytes);
let back = decode_hex_32(&hex).expect("valid hex round-trips");
assert_eq!(back, bytes);
}
#[test]
fn decode_hex_32_rejects_bad_length() {
assert!(decode_hex_32("").is_none());
assert!(decode_hex_32("abcd").is_none());
}
}