use serde::Deserialize;
use std::borrow::Borrow;
use std::hash::{BuildHasher, Hash};
use std::io::{Error, ErrorKind, Result};
use std::marker::PhantomData;
use crate::{
MassMapBucketMeta, MassMapDefaultHashLoader, MassMapHashLoader, MassMapHeader, MassMapInfo,
MassMapMeta, MassMapReader,
};
/// Untyped, loaded state of a mass map: everything read from the file up front
/// plus the reader used to fetch bucket data later.
///
/// This type carries no key/value type parameters; [`MassMapInner::cast`] wraps
/// it in a typed [`MassMap`].
#[derive(Debug)]
pub struct MassMapInner<R: MassMapReader, H: MassMapHashLoader = MassMapDefaultHashLoader> {
/// Header decoded from the first `MassMapHeader::SIZE` bytes of the reader.
pub header: MassMapHeader,
/// Map-level metadata decoded from the region the header points at
/// (`header.meta_offset` / `header.meta_length`).
pub meta: MassMapMeta,
/// Per-bucket metadata, decoded together with `meta`; indexed by bucket index.
pub bucket_metas: Vec<MassMapBucketMeta>,
/// Hasher factory produced by `H::load` from `meta.hash_config`; used to map
/// keys to bucket indices.
build_hasher: H::BuildHasher,
/// Reader that all bucket data is fetched from.
pub reader: R,
}
/// Typed view over a [`MassMapInner`], fixing the key type `K` and value type
/// `V` that bucket entries are deserialized into.
pub struct MassMap<K, V, R: MassMapReader, H: MassMapHashLoader = MassMapDefaultHashLoader> {
/// Loaded header, metadata, hasher and reader shared by all lookups.
inner: MassMapInner<R, H>,
/// `K` and `V` are never stored, only produced by deserialization, so they
/// are recorded with a zero-sized marker.
phantom_data: PhantomData<(K, V)>,
}
impl<R: MassMapReader, H: MassMapHashLoader> MassMapInner<R, H> {
    /// Reads the header and metadata from `reader` and builds the hasher.
    ///
    /// The header is read from offset 0; the metadata region is located via the
    /// header's `meta_offset` / `meta_length` and decoded as MessagePack into a
    /// `(MassMapMeta, Vec<MassMapBucketMeta>)` pair.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the reader or by `H::load`, and an
    /// [`ErrorKind::InvalidData`] error when the metadata cannot be decoded.
    pub fn load(reader: R) -> Result<Self> {
        let header_size = MassMapHeader::SIZE as u64;
        let header = reader.read_exact_at(0, header_size, MassMapHeader::deserialize)?;

        let (meta, bucket_metas) =
            reader.read_exact_at(header.meta_offset, header.meta_length, |bytes| {
                rmp_serde::from_slice::<(MassMapMeta, Vec<MassMapBucketMeta>)>(bytes).map_err(
                    |err| {
                        let message = format!("Failed to deserialize MassMapMeta: {}", err);
                        Error::new(ErrorKind::InvalidData, message)
                    },
                )
            })?;

        // The hasher configuration is stored in the metadata, so it can only be
        // built once the metadata has been decoded.
        let build_hasher = H::load(&meta.hash_config)?;

        Ok(Self {
            header,
            meta,
            bucket_metas,
            build_hasher,
            reader,
        })
    }

    /// Returns the entry count recorded in the metadata.
    pub fn len(&self) -> u64 {
        self.meta.entry_count
    }

    /// Returns `true` when the metadata records zero entries.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns a copy of the header and metadata bundled as a [`MassMapInfo`].
    pub fn info(&self) -> MassMapInfo {
        let header = self.header.clone();
        let meta = self.meta.clone();
        MassMapInfo { header, meta }
    }

    /// Consumes the untyped map and wraps it in a [`MassMap`] that deserializes
    /// entries as `(K, V)` pairs.
    ///
    /// No data is read or validated here; a mismatch between `K`/`V` and the
    /// stored entries only surfaces when a bucket is decoded.
    pub fn cast<K, V>(self) -> MassMap<K, V, R, H>
    where
        K: for<'de> Deserialize<'de> + Eq + Hash,
        V: for<'de> Deserialize<'de> + Clone,
    {
        MassMap {
            phantom_data: PhantomData,
            inner: self,
        }
    }
}
impl<K, V, R: MassMapReader, H: MassMapHashLoader> MassMap<K, V, R, H>
where
    K: for<'de> Deserialize<'de> + Eq + Hash,
    V: for<'de> Deserialize<'de> + Clone,
{
    /// Loads the header and metadata from `reader` and returns a typed map.
    ///
    /// # Errors
    ///
    /// Propagates every error from [`MassMapInner::load`].
    pub fn load(reader: R) -> Result<Self> {
        let inner = MassMapInner::load(reader)?;
        Ok(MassMap {
            inner,
            phantom_data: PhantomData,
        })
    }

    /// Returns the entry count recorded in the metadata.
    pub fn len(&self) -> u64 {
        self.inner.len()
    }

    /// Returns `true` when the metadata records zero entries.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Returns the number of buckets described by the metadata.
    pub fn bucket_count(&self) -> usize {
        self.inner.bucket_metas.len()
    }

    /// Returns the map-level metadata.
    pub fn meta(&self) -> &MassMapMeta {
        &self.inner.meta
    }

    /// Returns the per-bucket metadata, indexed by bucket index.
    pub fn bucket_metas(&self) -> &[MassMapBucketMeta] {
        &self.inner.bucket_metas
    }

    /// Returns the decoded file header.
    pub fn header(&self) -> &MassMapHeader {
        &self.inner.header
    }

    /// Returns the underlying reader.
    pub fn reader(&self) -> &R {
        &self.inner.reader
    }

    /// Returns a copy of the header and metadata bundled as a [`MassMapInfo`].
    pub fn info(&self) -> MassMapInfo {
        self.inner.info()
    }

    /// Looks up the value stored for `k`.
    ///
    /// The key is hashed to a bucket, the bucket is read and decoded, and its
    /// entries are scanned for an equal key. Returns `Ok(None)` when the key is
    /// absent, including when the map has no buckets at all.
    ///
    /// # Errors
    ///
    /// Returns the reader's error, or an [`ErrorKind::InvalidData`] error when
    /// the bucket cannot be decoded as `Vec<(K, V)>`.
    pub fn get<Q>(&self, k: &Q) -> Result<Option<V>>
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        // Without buckets there is nothing to hash into; `bucket_index` would
        // otherwise compute a remainder by zero.
        if self.inner.bucket_metas.is_empty() {
            return Ok(None);
        }
        let index = self.bucket_index(k);
        let entries = self.get_bucket(index)?;
        // The bucket is owned, so the matching value is moved out, not cloned.
        Ok(entries
            .into_iter()
            .find(|(key, _)| key.borrow() == k)
            .map(|(_, value)| value))
    }

    /// Looks up several keys through a single `batch_read_at` call on the reader.
    ///
    /// Each key is mapped to its bucket's `(offset, length)` and handed to the
    /// reader together with the key; the callback decodes the bucket and searches
    /// it for that key. The shape and order of the returned vector are whatever
    /// `batch_read_at` produces (presumably one slot per key, in input order;
    /// confirm against the reader implementation).
    ///
    /// A map with no buckets yields `None` for every key.
    ///
    /// # Errors
    ///
    /// Returns the reader's error, or an [`ErrorKind::InvalidData`] error when a
    /// bucket cannot be decoded as `Vec<(K, V)>`.
    pub fn batch_get<Q>(
        &self,
        keys: impl IntoIterator<Item = impl Borrow<Q>>,
    ) -> Result<Vec<Option<V>>>
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        // Same guard as `get`: avoid the remainder-by-zero in `bucket_index`.
        if self.inner.bucket_metas.is_empty() {
            return Ok(keys.into_iter().map(|_| None).collect());
        }
        let iov = keys.into_iter().map(|key| {
            let index = self.bucket_index(key.borrow());
            let bucket = &self.inner.bucket_metas[index];
            (key, bucket.offset, bucket.length as u64)
        });
        self.inner.reader.batch_read_at(iov, |expected, data| {
            // An empty payload is treated as a bucket without entries.
            if data.is_empty() {
                return Ok(None);
            }
            let entries = Self::decode_bucket(data)?;
            Ok(entries
                .into_iter()
                .find(|(key, _)| key.borrow() == expected.borrow())
                .map(|(_, value)| value))
        })
    }

    /// Returns an iterator over all entries, loading one bucket at a time.
    ///
    /// Each item is a `Result`; a bucket that fails to load yields one `Err`.
    pub fn iter(&self) -> MassMapIter<'_, K, V, R, H> {
        MassMapIter {
            map: self,
            bucket_index: 0,
            current_entries: Vec::new().into_iter(),
        }
    }

    /// Reads and decodes every entry of the bucket at `index`.
    ///
    /// A bucket whose metadata records zero entries is returned as an empty
    /// vector without touching the reader.
    ///
    /// # Errors
    ///
    /// Returns an [`ErrorKind::InvalidInput`] error when `index` is not a valid
    /// bucket index, the reader's error when the read fails, or an
    /// [`ErrorKind::InvalidData`] error when the bucket cannot be decoded.
    pub fn get_bucket(&self, index: usize) -> Result<Vec<(K, V)>> {
        let bucket = self.inner.bucket_metas.get(index).ok_or_else(|| {
            Error::new(
                ErrorKind::InvalidInput,
                format!(
                    "Bucket index {} out of range (bucket count: {})",
                    index,
                    self.inner.bucket_metas.len()
                ),
            )
        })?;
        if bucket.count == 0 {
            return Ok(Vec::new());
        }
        self.inner
            .reader
            .read_exact_at(bucket.offset, bucket.length as u64, |data| {
                Self::decode_bucket(data)
            })
    }

    /// Returns the index of the bucket that `k` hashes into.
    ///
    /// # Panics
    ///
    /// Panics when the map has no buckets, because the hash is reduced modulo
    /// the bucket count.
    pub fn bucket_index<Q>(&self, k: &Q) -> usize
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        (self.inner.build_hasher.hash_one(k) % (self.inner.bucket_metas.len() as u64)) as usize
    }

    /// Decodes the MessagePack payload of one bucket into its entries.
    fn decode_bucket(data: &[u8]) -> Result<Vec<(K, V)>> {
        rmp_serde::from_slice(data).map_err(|e| {
            Error::new(
                ErrorKind::InvalidData,
                format!("Failed to deserialize bucket entries: {}", e),
            )
        })
    }
}
/// Iterator over every entry of a [`MassMap`], created by `MassMap::iter`.
///
/// Buckets are fetched one at a time through `MassMap::get_bucket` as the
/// iterator advances.
pub struct MassMapIter<'a, K, V, R: MassMapReader, H: MassMapHashLoader> {
/// Map whose buckets are being walked.
map: &'a MassMap<K, V, R, H>,
/// Index of the next bucket to load.
bucket_index: usize,
/// Entries of the most recently loaded bucket that have not been yielded yet.
current_entries: std::vec::IntoIter<(K, V)>,
}
impl<'a, K, V, R: MassMapReader, H: MassMapHashLoader> Iterator for MassMapIter<'a, K, V, R, H>
where
    K: for<'de> Deserialize<'de> + Eq + Hash,
    V: for<'de> Deserialize<'de> + Clone,
{
    type Item = Result<(K, V)>;

    /// Yields the next entry, loading further buckets on demand.
    ///
    /// Entries of the current bucket are drained first. When they run out, the
    /// following bucket is loaded; empty buckets are skipped. A bucket that fails
    /// to load produces a single `Err` item, after which iteration resumes with
    /// the bucket behind it. Returns `None` once all buckets are consumed.
    fn next(&mut self) -> Option<Self::Item> {
        let total = self.map.inner.bucket_metas.len();
        loop {
            match self.current_entries.next() {
                Some(pair) => return Some(Ok(pair)),
                None if self.bucket_index >= total => return None,
                None => {}
            }

            // Advance the cursor before inspecting the outcome so that a failed
            // bucket is not retried on the next call.
            let position = self.bucket_index;
            self.bucket_index = position + 1;

            match self.map.get_bucket(position) {
                Ok(loaded) => self.current_entries = loaded.into_iter(),
                Err(err) => return Some(Err(err)),
            }
        }
    }
}
// Round-trip tests: each test builds a map file with `MassMapBuilder` in a
// temporary directory and reads it back through `MassMap` / `MassMapInner`.
#[cfg(test)]
mod tests {
use crate::*;
// Builds a five-entry map and checks metadata, single lookups and batch
// lookups (with both `&str` and `String` keys).
#[test]
fn test_basic() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = vec![
("apple", 1),
("banana", 2),
("cherry", 3),
("date", 4),
("elderberry", 5),
];
let builder = MassMapBuilder::default()
.with_hash_seed(42)
.with_bucket_count(8)
.with_writer_buffer_size(8 << 20) .with_field_names(true);
let info = builder.build(&writer, entries.iter()).unwrap();
assert_eq!(info.meta.entry_count, 5);
let file = std::fs::File::open(&file).unwrap();
// The metadata region is expected to end exactly at the end of the file.
assert_eq!(
info.header.meta_length + info.header.meta_offset,
file.metadata().unwrap().len()
);
let map = MassMap::<String, i32, _>::load(file).unwrap();
// The info returned by the builder must match what the reader decodes.
assert_eq!(info, map.info());
assert_eq!(map.len(), 5);
assert!(!map.is_empty());
assert_eq!(map.bucket_count(), 8);
// Per-bucket counts must add up to the total entry count.
assert_eq!(
map.inner.bucket_metas.iter().map(|b| b.count).sum::<u32>(),
5
);
assert_eq!(map.get("apple").unwrap(), Some(1));
assert_eq!(map.get("banana").unwrap(), Some(2));
assert_eq!(map.get("steins").unwrap(), None);
assert_eq!(map.get("gate").unwrap(), None);
// Batch lookup mixing present and absent keys; results follow key order.
let keys = vec!["cherry", "date", "fig", "elderberry", "steins", "gate"];
let results = map.batch_get::<str>(keys).unwrap();
assert_eq!(results, vec![Some(3), Some(4), None, Some(5), None, None]);
// Same lookup with owned `String` keys passed by reference.
let keys = ["cherry", "date", "fig", "elderberry", "steins", "gate"].map(|s| s.to_string());
let results = map.batch_get::<String>(&keys).unwrap();
assert_eq!(results, vec![Some(3), Some(4), None, Some(5), None, None]);
}
// Builds a map with one million entries and as many buckets, then spot-checks
// random present keys and keys just outside the stored range.
#[test]
fn test_1m() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap.bin");
let writer = std::fs::File::create(&file).unwrap();
const N: u64 = 1_000_000;
let entries = (0..N).map(|i| (i, i));
let builder = MassMapBuilder::default()
.with_bucket_count(N as u64)
.with_writer_buffer_size(8 << 20); builder.build(&writer, entries).unwrap();
let file = std::fs::File::open(&file).unwrap();
println!("massmap file size: {}", file.metadata().unwrap().len());
let map = MassMap::<u64, u64, _>::load(file).unwrap();
assert_eq!(map.len(), N as u64);
assert_eq!(map.bucket_count(), N as usize);
assert_eq!(
map.inner
.bucket_metas
.iter()
.map(|b| b.count as usize)
.sum::<usize>(),
N as usize
);
for _ in 0..10 {
// Keys are 0..N, so `k` is always present and `k + N` never is.
let k = rand::random::<u64>() % N as u64;
assert_eq!(map.get(&k).unwrap(), Some(k));
let k = k + N as u64;
assert_eq!(map.get(&k).unwrap(), None);
}
}
// Corrupts a valid file step by step and checks that each kind of damage is
// reported as an error instead of succeeding. The steps build on each other,
// so their order matters.
#[test]
fn test_invalid_data() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("massmap_invalid.bin");
let writer = std::fs::File::create(&path).unwrap();
const N: u64 = 1000;
let entries = (0..N).map(|i| (i, i));
let builder = MassMapBuilder::default()
.with_bucket_count(1)
.with_writer_buffer_size(8 << 20); let info = builder.build(&writer, entries).unwrap();
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.open(&path)
.unwrap();
{
// Overwrite bytes at offset 24 (presumably the start of the single
// bucket's data, right after the header; TODO confirm). The metadata is
// intact, so loading succeeds, but decoding the bucket must fail.
file.write_all_at(b"invalid data", 24).unwrap();
let file = std::fs::File::open(&path).unwrap();
let map = MassMap::<u64, u64, _>::load(file).unwrap();
map.get(&0).unwrap_err();
map.batch_get([0]).unwrap_err();
}
{
// Overwrite the start of the metadata region: loading must fail.
file.write_all_at(b"invalid data", info.header.meta_offset)
.unwrap();
let file = std::fs::File::open(&path).unwrap();
assert!(MassMap::<u64, u64, _>::load(file).is_err());
}
{
// Cut the last 8 bytes off the file so the metadata region is truncated.
file.set_len(info.header.meta_offset + info.header.meta_length - 8)
.unwrap();
let file = std::fs::File::open(&path).unwrap();
assert!(MassMap::<u64, u64, _>::load(file).is_err());
}
{
// Overwrite the header itself.
file.write_all_at(b"invalid data", 0).unwrap();
let file = std::fs::File::open(&path).unwrap();
assert!(MassMap::<u64, u64, _>::load(file).is_err());
}
{
// `File::create` truncates the file, so the reader sees an empty file.
let file = std::fs::File::create(&path).unwrap();
assert!(MassMap::<u64, u64, _>::load(file).is_err());
}
// Building with a 16-byte bucket size limit and all entries in one bucket is
// expected to be rejected by the builder.
let writer = std::fs::File::create(&path).unwrap();
let builder = MassMapBuilder::default()
.with_bucket_count(1)
.with_writer_buffer_size(8 << 20)
.with_bucket_size_limit(16);
let entries = (0..N).map(|i| (i, i));
builder.build(&writer, entries).unwrap_err();
}
// Iterating a small map yields exactly the entries that were written
// (compared after sorting, since iteration follows bucket order).
#[test]
fn test_iterator_basic() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = vec![
("apple", 1),
("banana", 2),
("cherry", 3),
("date", 4),
("elderberry", 5),
];
let builder = MassMapBuilder::default()
.with_hash_seed(42)
.with_bucket_count(8);
builder.build(&writer, entries.iter()).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<String, i32, _>::load(file).unwrap();
let mut collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected.len(), 5);
collected.sort_by(|a, b| a.0.cmp(&b.0));
let mut expected = entries
.iter()
.map(|(k, v)| (k.to_string(), *v))
.collect::<Vec<_>>();
expected.sort_by(|a, b| a.0.cmp(&b.0));
assert_eq!(collected, expected);
}
// Iterating a map built from no entries yields nothing.
#[test]
fn test_iterator_empty() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter_empty.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries: Vec<(String, i32)> = vec![];
let builder = MassMapBuilder::default().with_bucket_count(8);
builder.build(&writer, entries.iter()).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<String, i32, _>::load(file).unwrap();
let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected.len(), 0);
}
// With a single bucket, all entries come from one bucket load.
#[test]
fn test_iterator_single_bucket() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter_single.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = vec![("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5)];
let builder = MassMapBuilder::default().with_bucket_count(1);
builder.build(&writer, entries.iter()).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<String, i32, _>::load(file).unwrap();
let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected.len(), 5);
let mut collected_sorted = collected.clone();
collected_sorted.sort_by(|a, b| a.0.cmp(&b.0));
let mut expected = entries
.iter()
.map(|(k, v)| (k.to_string(), *v))
.collect::<Vec<_>>();
expected.sort_by(|a, b| a.0.cmp(&b.0));
assert_eq!(collected_sorted, expected);
}
// 1000 entries spread over 100 buckets: iteration must cross bucket
// boundaries and still return every entry exactly once.
#[test]
fn test_iterator_many_buckets() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter_many.bin");
let writer = std::fs::File::create(&file).unwrap();
const N: u64 = 1000;
let entries = (0..N).map(|i| (i, i * 2));
let builder = MassMapBuilder::default().with_bucket_count(100);
builder.build(&writer, entries).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<u64, u64, _>::load(file).unwrap();
let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected.len(), N as usize);
let mut collected_sorted = collected.clone();
collected_sorted.sort_by(|a, b| a.0.cmp(&b.0));
for i in 0..N {
assert_eq!(collected_sorted[i as usize], (i, i * 2));
}
}
// Two independent iterators over the same map yield identical sequences.
#[test]
fn test_iterator_multiple_iterations() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter_multiple.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = vec![("x", 10), ("y", 20), ("z", 30)];
let builder = MassMapBuilder::default().with_bucket_count(4);
builder.build(&writer, entries.iter()).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<String, i32, _>::load(file).unwrap();
let collected1: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected1.len(), 3);
let collected2: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
assert_eq!(collected2.len(), 3);
assert_eq!(collected1, collected2);
}
// The iterator composes with `take` / `skip` and can be dropped early.
#[test]
fn test_iterator_partial_iteration() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_iter_partial.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = (0..100).map(|i| (i, i));
let builder = MassMapBuilder::default().with_bucket_count(10);
builder.build(&writer, entries).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMap::<u64, u64, _>::load(file).unwrap();
let partial: Vec<_> = map
.iter()
.take(10)
.collect::<std::io::Result<Vec<_>>>()
.unwrap();
assert_eq!(partial.len(), 10);
let skip_take: Vec<_> = map
.iter()
.skip(20)
.take(5)
.collect::<std::io::Result<Vec<_>>>()
.unwrap();
assert_eq!(skip_take.len(), 5);
}
// Corrupts one bucket after the map has been loaded and checks that the
// iterator reports an error when it reaches that bucket.
#[test]
fn test_iterator_invalid_bucket() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("massmap_iter_invalid.bin");
let writer = std::fs::File::create(&path).unwrap();
let entries = (0..100).map(|i| (i, i));
let builder = MassMapBuilder::default()
.with_bucket_count(10)
.with_writer_buffer_size(8 << 20);
builder.build(&writer, entries).unwrap();
let file = std::fs::File::open(&path).unwrap();
let map = MassMap::<u64, u64, _>::load(file).unwrap();
// Pick the first non-empty bucket that does not start at offset 24
// (presumably the first data offset; TODO confirm) and overwrite its
// beginning. Bucket data is only read on demand, so the already-loaded map
// will see the damage during iteration.
for bucket in &map.inner.bucket_metas {
if bucket.offset != 24 && bucket.count > 0 {
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.open(&path)
.unwrap();
file.write_all_at(b"corrupted bucket", bucket.offset)
.unwrap();
break;
}
}
let mut found_error = false;
for result in map.iter() {
if result.is_err() {
found_error = true;
break;
}
}
assert!(found_error);
}
// Loads the untyped `MassMapInner` first and then casts it to a typed map
// whose value type (`i64`) differs from the integer literals that were
// written.
#[test]
fn test_massmap_cast() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("massmap_cast.bin");
let writer = std::fs::File::create(&file).unwrap();
let entries = vec![
("apple", 1),
("banana", 2),
("cherry", 3),
("date", 4),
("elderberry", 5),
];
let builder = MassMapBuilder::default()
.with_hash_seed(42)
.with_bucket_count(8);
builder.build(&writer, entries.iter()).unwrap();
let file = std::fs::File::open(&file).unwrap();
let map = MassMapInner::<_>::load(file).unwrap();
let casted_map: MassMap<String, i64, _, _> = map.cast();
assert_eq!(casted_map.get("apple").unwrap(), Some(1i64));
assert_eq!(casted_map.get("banana").unwrap(), Some(2i64));
assert_eq!(casted_map.get("steins").unwrap(), None);
}
}