use {
crate::record::Record,
serde::{
Serialize,
Serializer,
},
std::{
collections::{
HashMap,
HashSet,
},
hash::{
BuildHasher,
Hash,
Hasher,
},
},
xxhash_rust::xxh3::{
Xxh3,
xxh3_128,
},
};
/// A 128-bit content digest, stored as a raw `u128`.
///
/// Equality and ordering are derived from the wrapped integer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct ContentHash(pub(crate) u128);

impl Hash for ContentHash {
    /// Feeds the digest to `state` as exactly one `write_u128` call.
    ///
    /// Hashers that special-case `write_u128` (such as `ContentHashHasher`
    /// in this file) rely on the key arriving through that single call.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let Self(digest) = *self;
        state.write_u128(digest);
    }
}
impl Serialize for ContentHash {
    /// Serializes the hash as a string containing the decimal rendering of
    /// the underlying `u128`.
    ///
    /// NOTE(review): the `Display` impl renders the same value as 32 hex
    /// digits, so the serialized and displayed forms differ — confirm this
    /// is intentional before relying on either for cross-referencing.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let decimal = self.0.to_string();
        serializer.serialize_str(&decimal)
    }
}
impl std::fmt::Display for ContentHash {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:032x}", self.0)
}
}
impl ContentHash {
    /// The hash whose 128-bit value is zero.
    pub const ZERO: ContentHash = ContentHash(0);

    /// Hashes `data` in one shot with `xxh3_128`.
    pub fn new(data: &[u8]) -> Self {
        ContentHash(xxh3_128(data))
    }

    /// Hashes everything `reader` yields until end-of-stream.
    ///
    /// Input is consumed in 8 KiB chunks and fed to a streaming `Xxh3`
    /// state, so the whole stream is never held in memory.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported by `reader`, except
    /// [`std::io::ErrorKind::Interrupted`], which is retried.
    pub fn from_reader(mut reader: impl std::io::Read) -> std::io::Result<Self> {
        let mut hasher = Xxh3::new();
        let mut buffer = [0u8; 8192];
        loop {
            match reader.read(&mut buffer) {
                // A zero-length read signals end-of-stream.
                Ok(0) => break,
                Ok(n) => hasher.update(&buffer[..n]),
                // `Read::read` documents `Interrupted` as non-fatal: nothing
                // was consumed, so simply try again instead of failing.
                Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(err) => return Err(err),
            }
        }
        Ok(ContentHash(hasher.digest128()))
    }

    /// Returns a fresh [`ContentHasher`] for hashing data incrementally.
    pub fn hasher() -> ContentHasher {
        ContentHasher(Xxh3::new())
    }
}
/// Incremental hasher wrapping an `Xxh3` state; finishes into a
/// [`ContentHash`].
pub struct ContentHasher(Xxh3);

impl ContentHasher {
    /// Feeds `data` into the running hash state.
    pub fn update(&mut self, data: &[u8]) {
        let Self(state) = self;
        state.update(data);
    }

    /// Consumes the hasher and returns the 128-bit digest of everything fed
    /// so far.
    pub fn finalize(self) -> ContentHash {
        let Self(state) = self;
        ContentHash(state.digest128())
    }
}
impl std::hash::Hasher for ContentHasher {
fn finish(&self) -> u64 {
self.0.digest()
}
fn write(&mut self, bytes: &[u8]) {
self.0.update(bytes);
}
}
/// Hasher for keys that are already 128-bit digests.
///
/// It performs no mixing of its own: the only supported input is a single
/// `write_u128`, whose two 64-bit halves are XOR-folded into the result.
/// The default state is `0`.
#[derive(Default)]
pub struct ContentHashHasher(u64);

impl Hasher for ContentHashHasher {
    /// Returns the folded value stored by the last `write_u128`.
    #[inline(always)]
    fn finish(&self) -> u64 {
        let Self(folded) = self;
        *folded
    }

    /// Unsupported input path.
    ///
    /// Trips a debug assertion; with debug assertions disabled the bytes
    /// are ignored and the state is left untouched.
    #[inline(always)]
    fn write(&mut self, _bytes: &[u8]) {
        debug_assert!(
            false,
            "ContentHashHasher only accepts write_u128; \
             a HashMap lookup will silently miss in release builds"
        );
    }

    /// Replaces the state with the XOR of the low and high 64 bits of `i`.
    #[inline(always)]
    fn write_u128(&mut self, i: u128) {
        let low = i as u64; // truncation keeps the low half on purpose
        let high = (i >> 64) as u64;
        self.0 = low ^ high;
    }
}
#[derive(Clone, Debug, Default)]
pub struct ContentHashState;
impl BuildHasher for ContentHashState {
type Hasher = ContentHashHasher;
#[inline(always)]
fn build_hasher(&self) -> Self::Hasher {
ContentHashHasher::default()
}
}
/// A `HashMap` keyed by [`ContentHash`] that uses [`ContentHashState`] as its
/// hasher builder.
pub type ContentHashMap<V> = HashMap<ContentHash, V, ContentHashState>;
/// A `HashSet` of [`ContentHash`] values that uses [`ContentHashState`] as its
/// hasher builder.
pub type ContentHashSet = HashSet<ContentHash, ContentHashState>;
pub trait ContentHashMapExt {
fn new() -> Self;
fn with_capacity(capacity: usize) -> Self;
}
impl<V> ContentHashMapExt for ContentHashMap<V> {
#[inline(always)]
fn new() -> Self {
Self::with_hasher(ContentHashState)
}
#[inline(always)]
fn with_capacity(capacity: usize) -> Self {
Self::with_capacity_and_hasher(capacity, ContentHashState)
}
}
pub trait ContentHashSetExt {
fn new() -> Self;
fn with_capacity(capacity: usize) -> Self;
}
impl ContentHashSetExt for ContentHashSet {
#[inline(always)]
fn new() -> Self {
Self::with_hasher(ContentHashState)
}
#[inline(always)]
fn with_capacity(capacity: usize) -> Self {
Self::with_capacity_and_hasher(capacity, ContentHashState)
}
}
/// Collection of records keyed by the [`ContentHash`] each record reports
/// for itself.
pub struct ContentHashedWriter<R> {
    records: ContentHashMap<R>,
}

impl<R: Record> ContentHashedWriter<R> {
    /// Creates an empty writer.
    pub fn new() -> Self {
        let records = ContentHashMap::new();
        Self { records }
    }

    /// Creates an empty writer pre-sized for `capacity` records.
    pub fn with_capacity(capacity: usize) -> Self {
        let records = ContentHashMap::with_capacity(capacity);
        Self { records }
    }

    /// Stores `record` under its content hash and returns that hash.
    ///
    /// If a record with the same hash is already present it is kept and the
    /// new `record` is dropped.
    pub fn insert(&mut self, record: R) -> ContentHash {
        use std::collections::hash_map::Entry;

        let hash = record.content_hash();
        if let Entry::Vacant(slot) = self.records.entry(hash) {
            slot.insert(record);
        }
        hash
    }

    /// Returns the record stored under `hash`, if any.
    pub fn get(&self, hash: &ContentHash) -> Option<&R> {
        self.records.get(hash)
    }

    /// Returns `true` if a record is stored under `hash`.
    pub fn contains(&self, hash: &ContentHash) -> bool {
        self.records.contains_key(hash)
    }

    /// Number of distinct records held.
    pub fn len(&self) -> usize {
        self.records.len()
    }

    /// Returns `true` if no records are held.
    pub fn is_empty(&self) -> bool {
        self.records.is_empty()
    }

    /// Iterates over `(hash, record)` pairs in the map's (arbitrary) order.
    pub fn iter(&self) -> impl Iterator<Item = (&ContentHash, &R)> {
        self.records.iter()
    }
}
impl<R: Record> IntoIterator for ContentHashedWriter<R> {
    type Item = (ContentHash, R);
    type IntoIter = std::collections::hash_map::IntoIter<ContentHash, R>;

    /// Consumes the writer, yielding owned `(hash, record)` pairs from the
    /// underlying map.
    fn into_iter(self) -> Self::IntoIter {
        let Self { records } = self;
        records.into_iter()
    }
}
impl<R: Record> Default for ContentHashedWriter<R> {
fn default() -> Self {
Self::new()
}
}
/// Collection of records keyed by [`ContentHash`], populated by merging
/// [`ContentHashedWriter`]s or by inserting records under caller-supplied
/// hashes.
pub struct ContentHashedStore<R> {
    records: ContentHashMap<R>,
}

impl<R: Record> ContentHashedStore<R> {
    /// Creates an empty store.
    pub fn new() -> Self {
        Self {
            records: ContentHashMap::new(),
        }
    }

    /// Creates an empty store pre-sized for `capacity` records.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            records: ContentHashMap::with_capacity(capacity),
        }
    }

    /// Moves every record out of `writer` into this store.
    ///
    /// When a hash is already present the existing record is kept and the
    /// incoming one is dropped. The writer's keys are trusted as-is; records
    /// are not re-hashed.
    ///
    /// Returns the number of records consumed from `writer`, including those
    /// whose hash was already present.
    pub fn merge_from(&mut self, writer: ContentHashedWriter<R>) -> usize {
        // Every entry of the writer is processed exactly once, so the count
        // is known up front without touching the records themselves.
        let merged = writer.len();
        for (hash, record) in writer {
            self.records.entry(hash).or_insert(record);
        }
        merged
    }

    /// Stores `record` under the caller-supplied `hash`.
    ///
    /// The hash is not checked against the record's content. Returns `true`
    /// if the record was inserted, or `false` if `hash` was already present
    /// (the existing record is kept and `record` is dropped).
    pub fn insert_with_hash(&mut self, hash: ContentHash, record: R) -> bool {
        use std::collections::hash_map::Entry;
        match self.records.entry(hash) {
            Entry::Occupied(_) => false,
            Entry::Vacant(entry) => {
                entry.insert(record);
                true
            }
        }
    }

    /// Returns the record stored under `hash`, if any.
    pub fn get(&self, hash: &ContentHash) -> Option<&R> {
        self.records.get(hash)
    }

    /// Returns `true` if a record is stored under `hash`.
    pub fn contains(&self, hash: &ContentHash) -> bool {
        self.records.contains_key(hash)
    }

    /// Number of records held.
    pub fn len(&self) -> usize {
        self.records.len()
    }

    /// Returns `true` if no records are held.
    pub fn is_empty(&self) -> bool {
        self.records.is_empty()
    }

    /// Iterates over `(hash, record)` pairs in the map's (arbitrary) order.
    pub fn iter(&self) -> impl Iterator<Item = (&ContentHash, &R)> {
        self.records.iter()
    }

    /// Keeps only the records for which `predicate` returns `true`.
    ///
    /// The predicate receives each hash and a mutable reference to its
    /// record.
    pub fn retain<F>(&mut self, predicate: F)
    where
        F: FnMut(&ContentHash, &mut R) -> bool,
    {
        self.records.retain(predicate);
    }

    /// Iterates over the hashes of all stored records.
    pub fn hashes(&self) -> impl Iterator<Item = &ContentHash> {
        self.records.keys()
    }
}
impl<R: Record> Default for ContentHashedStore<R> {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal record whose content hash covers both of its fields.
    #[derive(Debug, Clone, PartialEq, Eq)]
    struct TestRecord {
        id: u32,
        value: String,
    }

    impl Record for TestRecord {
        fn content_hash(&self) -> ContentHash {
            let mut hasher = ContentHash::hasher();
            hasher.update(&self.id.to_le_bytes());
            hasher.update(self.value.as_bytes());
            hasher.finalize()
        }
    }

    /// Shorthand for building a `TestRecord`.
    fn rec(id: u32, value: &str) -> TestRecord {
        TestRecord {
            id,
            value: value.to_string(),
        }
    }

    // --- map / set constructors -------------------------------------------

    #[test]
    fn test_content_hash_map_new() {
        let map: ContentHashMap<i32> = ContentHashMap::new();
        assert!(map.is_empty());
    }

    #[test]
    fn test_content_hash_map_with_capacity() {
        let map: ContentHashMap<i32> = ContentHashMap::with_capacity(100);
        assert!(map.capacity() >= 100);
    }

    #[test]
    fn test_content_hash_set_new() {
        let set: ContentHashSet = ContentHashSet::new();
        assert!(set.is_empty());
    }

    #[test]
    fn test_content_hash_set_with_capacity() {
        let set: ContentHashSet = ContentHashSet::with_capacity(100);
        assert!(set.capacity() >= 100);
    }

    // --- writer ------------------------------------------------------------

    #[test]
    fn test_writer_insert_and_retrieve() {
        let mut writer = ContentHashedWriter::new();
        let record = rec(1, "test");

        let hash = writer.insert(record.clone());

        assert_eq!(writer.len(), 1);
        assert!(!writer.is_empty());
        assert!(writer.contains(&hash));

        let retrieved = writer.get(&hash).unwrap();
        assert_eq!(retrieved.id, 1);
        assert_eq!(retrieved.value, "test");
    }

    #[test]
    fn test_writer_insert_computes_hash() {
        let mut writer = ContentHashedWriter::new();
        let record = rec(42, "hello");
        let expected_hash = record.content_hash();

        let returned_hash = writer.insert(record);

        assert_eq!(returned_hash, expected_hash);
    }

    #[test]
    fn test_writer_duplicate_insert_keeps_first() {
        let mut writer = ContentHashedWriter::new();
        let record1 = rec(1, "test");
        let record2 = rec(1, "test");

        let hash1 = writer.insert(record1);
        let hash2 = writer.insert(record2);

        assert_eq!(hash1, hash2);
        assert_eq!(writer.len(), 1);
    }

    #[test]
    fn test_writer_iter() {
        let mut writer = ContentHashedWriter::new();
        writer.insert(rec(1, "a"));
        writer.insert(rec(2, "b"));

        assert_eq!(writer.iter().count(), 2);
    }

    #[test]
    fn test_writer_into_iterator() {
        let mut writer = ContentHashedWriter::new();
        writer.insert(rec(1, "a"));
        writer.insert(rec(2, "b"));

        let items: Vec<_> = writer.into_iter().collect();
        assert_eq!(items.len(), 2);
    }

    #[test]
    fn test_writer_default() {
        let writer: ContentHashedWriter<TestRecord> = ContentHashedWriter::default();
        assert!(writer.is_empty());
    }

    // --- store -------------------------------------------------------------

    #[test]
    fn test_store_new() {
        let store: ContentHashedStore<TestRecord> = ContentHashedStore::new();
        assert!(store.is_empty());
        assert_eq!(store.len(), 0);
    }

    #[test]
    fn test_store_merge_from_writer() {
        let mut writer = ContentHashedWriter::new();
        let hash1 = writer.insert(rec(1, "a"));
        let hash2 = writer.insert(rec(2, "b"));

        let mut store = ContentHashedStore::new();
        let count = store.merge_from(writer);

        assert_eq!(count, 2);
        assert_eq!(store.len(), 2);
        assert!(store.contains(&hash1));
        assert!(store.contains(&hash2));
    }

    #[test]
    fn test_store_merge_duplicate_keeps_first() {
        let mut writer1 = ContentHashedWriter::new();
        let hash = writer1.insert(rec(1, "first"));

        let mut store = ContentHashedStore::new();
        store.merge_from(writer1);

        let mut writer2 = ContentHashedWriter::new();
        writer2.insert(rec(1, "first"));
        let count = store.merge_from(writer2);

        assert_eq!(count, 1);
        assert_eq!(store.len(), 1);
        let record = store.get(&hash).unwrap();
        assert_eq!(record.value, "first");
    }

    #[test]
    fn test_store_insert_with_hash() {
        let mut store = ContentHashedStore::new();
        let record = rec(1, "test");
        let hash = record.content_hash();

        let inserted = store.insert_with_hash(hash, record);
        assert!(inserted);
        assert_eq!(store.len(), 1);

        let duplicate = rec(1, "test");
        let inserted_again = store.insert_with_hash(hash, duplicate);
        assert!(!inserted_again);
        assert_eq!(store.len(), 1);
    }

    #[test]
    fn test_store_retain() {
        let mut writer = ContentHashedWriter::new();
        writer.insert(rec(1, "keep"));
        writer.insert(rec(2, "remove"));
        writer.insert(rec(3, "keep"));

        let mut store = ContentHashedStore::new();
        store.merge_from(writer);
        assert_eq!(store.len(), 3);

        store.retain(|_, r| r.value == "keep");
        assert_eq!(store.len(), 2);
    }

    #[test]
    fn test_store_hashes() {
        let mut writer = ContentHashedWriter::new();
        let hash1 = writer.insert(rec(1, "a"));
        let hash2 = writer.insert(rec(2, "b"));

        let mut store = ContentHashedStore::new();
        store.merge_from(writer);

        let hashes: Vec<_> = store.hashes().collect();
        assert_eq!(hashes.len(), 2);
        assert!(hashes.contains(&&hash1));
        assert!(hashes.contains(&&hash2));
    }

    #[test]
    fn test_store_default() {
        let store: ContentHashedStore<TestRecord> = ContentHashedStore::default();
        assert!(store.is_empty());
    }

    // --- empty collections ---------------------------------------------------

    #[test]
    fn test_empty_writer_behavior() {
        let writer: ContentHashedWriter<TestRecord> = ContentHashedWriter::new();
        assert!(writer.is_empty());
        assert_eq!(writer.len(), 0);
        assert_eq!(writer.iter().count(), 0);

        let nonexistent_hash = ContentHash::new(b"nonexistent");
        assert!(!writer.contains(&nonexistent_hash));
        assert!(writer.get(&nonexistent_hash).is_none());
    }

    #[test]
    fn test_empty_store_behavior() {
        let store: ContentHashedStore<TestRecord> = ContentHashedStore::new();
        assert!(store.is_empty());
        assert_eq!(store.len(), 0);
        assert_eq!(store.iter().count(), 0);

        let nonexistent_hash = ContentHash::new(b"nonexistent");
        assert!(!store.contains(&nonexistent_hash));
        assert!(store.get(&nonexistent_hash).is_none());
    }

    #[test]
    fn test_merge_empty_writer() {
        let writer: ContentHashedWriter<TestRecord> = ContentHashedWriter::new();
        let mut store = ContentHashedStore::new();

        let count = store.merge_from(writer);

        assert_eq!(count, 0);
        assert!(store.is_empty());
    }
}