use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use sha2::{Digest, Sha256};
use super::objstore::{DynObjectBackend, FsBackend};
/// Content-addressed blob store layered over an object backend.
///
/// Blobs are identified by the hex-encoded SHA-256 digest of their
/// uncompressed content and are written to the backend compressed with
/// `snap`'s raw encoder (see `put` and `get`).
pub struct BlobStore {
/// Backend that holds the compressed blob objects.
backend: DynObjectBackend,
}
impl BlobStore {
    /// Opens a filesystem-backed store.
    ///
    /// The root directory is the value of the `TAEL_BLOB_DIR` environment
    /// variable when it is set to a non-blank value; otherwise it is
    /// `<data_dir>/blobs`.
    ///
    /// # Errors
    /// Propagates any error returned by `FsBackend::new`.
    pub fn new(data_dir: &str) -> Result<Self> {
        // `var_os` instead of `var`: a path that is not valid Unicode is still a
        // legitimate directory and must not be silently ignored. A value that is
        // empty or only whitespace is treated as "unset", as before.
        let root = match std::env::var_os("TAEL_BLOB_DIR") {
            Some(dir) if dir.to_str().map_or(true, |text| !text.trim().is_empty()) => {
                PathBuf::from(dir)
            }
            _ => Path::new(data_dir).join("blobs"),
        };
        Self::with_backend(std::sync::Arc::new(FsBackend::new(root)?))
    }

    /// Wraps an already-constructed backend.
    ///
    /// This currently always returns `Ok`; the `Result` is part of the
    /// signature callers rely on.
    pub fn with_backend(backend: DynObjectBackend) -> Result<Self> {
        Ok(Self { backend })
    }

    /// Stores `content` and returns its hex-encoded SHA-256 digest.
    ///
    /// If the backend reports that an object already exists under the key
    /// derived from the digest, nothing is written and the digest is returned
    /// as-is, so identical content is stored once.
    ///
    /// # Errors
    /// Fails if the backend's `exists`/`put` call fails or if compression
    /// fails (the latter is reported with the context "compressing blob").
    pub fn put(&self, content: &[u8]) -> Result<String> {
        let hash = hex::encode(Sha256::digest(content));
        let key = key_for(&hash);
        if !self.backend.exists(&key)? {
            let compressed = snap::raw::Encoder::new()
                .compress_vec(content)
                .context("compressing blob")?;
            self.backend.put(&key, &compressed)?;
        }
        Ok(hash)
    }

    /// Fetches and decompresses the blob stored under `hash`.
    ///
    /// Returns `Ok(None)` when the backend has no object for the derived key.
    ///
    /// # Errors
    /// Fails if the backend read fails or if the stored bytes cannot be
    /// decompressed (reported with the context "decompressing blob").
    pub fn get(&self, hash: &str) -> Result<Option<Vec<u8>>> {
        self.backend
            .get(&key_for(hash))?
            .map(|compressed| {
                snap::raw::Decoder::new()
                    .decompress_vec(&compressed)
                    .context("decompressing blob")
            })
            .transpose()
    }

    /// Lists the hashes of all stored blobs.
    ///
    /// Each entry is the final `/`-separated segment of a key returned by
    /// `backend.list("")`, which strips the shard directories added by
    /// `key_for`.
    ///
    /// # Errors
    /// Propagates any error from the backend's `list` call.
    pub fn list_hashes(&self) -> Result<Vec<String>> {
        Ok(self
            .backend
            .list("")?
            .iter()
            // `rsplit` always yields at least one segment, so no key is dropped.
            .filter_map(|key| key.rsplit('/').next().map(str::to_string))
            .collect())
    }

    /// Deletes the blob stored under `hash`.
    ///
    /// # Errors
    /// Propagates any error from the backend's `delete` call.
    pub fn remove(&self, hash: &str) -> Result<()> {
        self.backend.delete(&key_for(hash))
    }

    /// Removes every stored blob whose hash is not in `live`.
    ///
    /// Returns the number of blobs removed.
    ///
    /// # Errors
    /// Stops at the first listing or deletion error; blobs deleted before the
    /// failure stay deleted.
    pub fn gc(&self, live: &std::collections::HashSet<String>) -> Result<usize> {
        let mut removed = 0;
        for hash in self.list_hashes()? {
            if live.contains(&hash) {
                continue;
            }
            self.remove(&hash)?;
            removed += 1;
        }
        Ok(removed)
    }
}
/// Maps a blob hash to its backend object key.
///
/// A hash with at least four leading bytes is sharded into two directory
/// levels taken from its first two and next two bytes, e.g.
/// `"abcdef"` becomes `"ab/cd/abcdef"`. Anything shorter is returned unchanged.
///
/// The input is not guaranteed to be hex: `get` and `remove` pass
/// caller-supplied strings straight through. If byte offset 2 or 4 falls
/// inside a multi-byte character, the input is also returned unchanged rather
/// than panicking on an invalid slice.
fn key_for(hash: &str) -> String {
    // `str::get` yields `None` for out-of-range or non-char-boundary ranges,
    // where direct indexing (`&hash[0..2]`) would panic.
    match (hash.get(0..2), hash.get(2..4)) {
        (Some(outer), Some(inner)) => format!("{outer}/{inner}/{hash}"),
        _ => hash.to_string(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a store rooted at `<tempdir>/blobs`.
    ///
    /// The store is built through `with_backend` rather than `BlobStore::new`
    /// so this helper never reads `TAEL_BLOB_DIR`. Tests run in parallel within
    /// one process, and the env-override test below mutates that variable; a
    /// helper that read it could open its store in that test's directory.
    fn store() -> (BlobStore, tempfile::TempDir) {
        let dir = tempfile::tempdir().unwrap();
        let backend = FsBackend::new(dir.path().join("blobs")).unwrap();
        let s = BlobStore::with_backend(std::sync::Arc::new(backend)).unwrap();
        (s, dir)
    }

    #[test]
    fn put_then_get_round_trips() {
        let (s, _d) = store();
        let content = b"You are a helpful assistant.";
        let hash = s.put(content).unwrap();
        assert_eq!(hash.len(), 64);
        assert_eq!(s.get(&hash).unwrap().as_deref(), Some(&content[..]));
    }

    #[test]
    fn identical_content_dedups_to_one_file() {
        let (s, _d) = store();
        let h1 = s.put(b"same system prompt").unwrap();
        let h2 = s.put(b"same system prompt").unwrap();
        assert_eq!(h1, h2);
        assert_eq!(
            s.list_hashes().unwrap().len(),
            1,
            "duplicate content should produce one blob"
        );
    }

    #[test]
    fn distinct_content_distinct_hashes() {
        let (s, _d) = store();
        assert_ne!(s.put(b"prompt a").unwrap(), s.put(b"prompt b").unwrap());
    }

    #[test]
    fn missing_hash_returns_none() {
        let (s, _d) = store();
        assert!(s.get(&"0".repeat(64)).unwrap().is_none());
    }

    #[test]
    fn gc_removes_only_unreferenced_blobs() {
        use std::collections::HashSet;
        let (s, _d) = store();
        let keep = s.put(b"referenced prompt").unwrap();
        let _drop = s.put(b"orphaned prompt").unwrap();
        assert_eq!(s.list_hashes().unwrap().len(), 2);
        let live: HashSet<String> = [keep.clone()].into_iter().collect();
        let removed = s.gc(&live).unwrap();
        assert_eq!(removed, 1);
        assert_eq!(s.list_hashes().unwrap(), vec![keep.clone()]);
        assert!(s.get(&keep).unwrap().is_some());
    }

    /// The only test in this module that touches `TAEL_BLOB_DIR`; it covers
    /// both the override and the default-root branch of `BlobStore::new`.
    #[test]
    fn blob_dir_env_override_relocates_store() {
        let data = tempfile::tempdir().unwrap();
        let blobs = tempfile::tempdir().unwrap();

        // SAFETY: no other test in this module reads or writes this variable,
        // and std's own environment accessors are synchronized with each other.
        // NOTE(review): this assumes no other thread is reading the environment
        // through non-Rust code at the same time - confirm for the full test binary.
        unsafe { std::env::set_var("TAEL_BLOB_DIR", blobs.path()) };
        let opened = BlobStore::new(data.path().to_str().unwrap());
        // Clear the variable before unwrapping so a construction failure cannot
        // leave it set for the rest of the process.
        // SAFETY: same reasoning as for `set_var` above.
        unsafe { std::env::remove_var("TAEL_BLOB_DIR") };
        let s = opened.unwrap();

        let hash = s.put(b"relocated").unwrap();
        assert!(s.get(&hash).unwrap().is_some());
        assert!(
            !data.path().join("blobs").exists(),
            "nothing should be written under data_dir when TAEL_BLOB_DIR is set"
        );

        // With the variable cleared, `new` must fall back to `<data_dir>/blobs`.
        // This lives in the same test so it cannot race with the override above.
        let fallback = tempfile::tempdir().unwrap();
        let default_store = BlobStore::new(fallback.path().to_str().unwrap()).unwrap();
        let default_hash = default_store.put(b"default location").unwrap();
        assert!(default_store.get(&default_hash).unwrap().is_some());
        assert!(
            fallback.path().join("blobs").exists(),
            "blobs should be written under data_dir when TAEL_BLOB_DIR is unset"
        );
    }
}