1use serde::Deserialize;
2use std::borrow::Borrow;
3use std::hash::{BuildHasher, Hash};
4use std::io::{Error, ErrorKind, Result};
5use std::marker::PhantomData;
6
7use crate::{
8 MassMapBucketMeta, MassMapDefaultHashLoader, MassMapHashLoader, MassMapHeader, MassMapInfo,
9 MassMapMeta, MassMapReader,
10};
11
12#[derive(Debug)]
25pub struct MassMapInner<R: MassMapReader, H: MassMapHashLoader = MassMapDefaultHashLoader> {
26 pub header: MassMapHeader,
28 pub meta: MassMapMeta,
30 pub(crate) bucket_metas: Vec<MassMapBucketMeta>,
32 build_hasher: H::BuildHasher,
34 pub(crate) reader: R,
36}
37
38pub struct MassMap<K, V, R: MassMapReader, H: MassMapHashLoader = MassMapDefaultHashLoader> {
43 inner: MassMapInner<R, H>,
44 phantom_data: PhantomData<(K, V)>,
46}
47
48impl<R: MassMapReader, H: MassMapHashLoader> MassMapInner<R, H> {
49 pub fn load(reader: R) -> Result<Self> {
55 let header =
56 reader.read_exact_at(0, MassMapHeader::SIZE as u64, MassMapHeader::deserialize)?;
57
58 let (meta, bucket_metas): (MassMapMeta, Vec<MassMapBucketMeta>) =
59 reader.read_exact_at(header.meta_offset, header.meta_length, |data| {
60 rmp_serde::from_slice(data).map_err(|e| {
61 Error::new(
62 ErrorKind::InvalidData,
63 format!("Failed to deserialize MassMapMeta: {}", e),
64 )
65 })
66 })?;
67
68 let build_hasher = H::load(&meta.hash_config)?;
69 Ok(MassMapInner {
70 header,
71 meta,
72 bucket_metas,
73 build_hasher,
74 reader,
75 })
76 }
77
78 pub fn len(&self) -> u64 {
80 self.meta.entry_count
81 }
82
83 pub fn is_empty(&self) -> bool {
85 self.meta.entry_count == 0
86 }
87
88 pub fn info(&self) -> MassMapInfo {
90 MassMapInfo {
91 header: self.header.clone(),
92 meta: self.meta.clone(),
93 }
94 }
95
96 pub fn cast<K, V>(self) -> MassMap<K, V, R, H>
98 where
99 K: for<'de> Deserialize<'de> + Eq + Hash,
100 V: for<'de> Deserialize<'de> + Clone,
101 {
102 MassMap {
103 inner: self,
104 phantom_data: PhantomData,
105 }
106 }
107}
108
109impl<K, V, R: MassMapReader, H: MassMapHashLoader> MassMap<K, V, R, H>
110where
111 K: for<'de> Deserialize<'de> + Eq + Hash,
112 V: for<'de> Deserialize<'de> + Clone,
113{
114 pub fn load(reader: R) -> Result<Self> {
120 let inner = MassMapInner::load(reader)?;
121 Ok(MassMap {
122 inner,
123 phantom_data: PhantomData,
124 })
125 }
126
127 pub fn len(&self) -> u64 {
129 self.inner.len()
130 }
131
132 pub fn is_empty(&self) -> bool {
134 self.inner.is_empty()
135 }
136
137 pub fn bucket_count(&self) -> usize {
141 self.inner.bucket_metas.len()
142 }
143
144 pub(crate) fn meta(&self) -> &MassMapMeta {
148 &self.inner.meta
149 }
150
151 pub(crate) fn bucket_metas(&self) -> &[MassMapBucketMeta] {
153 &self.inner.bucket_metas
154 }
155
156 pub(crate) fn header(&self) -> &MassMapHeader {
158 &self.inner.header
159 }
160
161 pub(crate) fn reader(&self) -> &R {
163 &self.inner.reader
164 }
165
166 pub fn info(&self) -> MassMapInfo {
168 self.inner.info()
169 }
170
171 pub fn get<Q>(&self, k: &Q) -> Result<Option<V>>
181 where
182 K: Borrow<Q>,
183 Q: Eq + Hash + ?Sized,
184 {
185 let index = self.bucket_index(k);
186 let entries = self.get_bucket(index)?;
187 for (key, value) in entries.iter() {
188 if key.borrow() == k {
189 return Ok(Some(value.clone()));
190 }
191 }
192 Ok(None)
193 }
194
195 pub fn batch_get<Q>(
205 &self,
206 keys: impl IntoIterator<Item = impl Borrow<Q>>,
207 ) -> Result<Vec<Option<V>>>
208 where
209 K: Borrow<Q>,
210 Q: Eq + Hash + ?Sized,
211 {
212 let iov = keys.into_iter().map(|key| {
213 let index = self.bucket_index(key.borrow());
214 let bucket = &self.inner.bucket_metas[index];
215 (key, bucket.offset, bucket.length as u64)
216 });
217
218 self.inner.reader.batch_read_at(iov, |expected, data| {
219 if data.is_empty() {
220 return Ok(None);
221 }
222
223 let entries: Vec<(K, V)> = rmp_serde::from_slice(data).map_err(|e| {
224 Error::new(
225 ErrorKind::InvalidData,
226 format!("Failed to deserialize bucket entries: {}", e),
227 )
228 })?;
229
230 for (key, value) in entries.iter() {
231 if key.borrow() == expected.borrow() {
232 return Ok(Some(value.clone()));
233 }
234 }
235 Ok(None)
236 })
237 }
238
239 pub fn iter(&self) -> MassMapIter<'_, K, V, R, H> {
264 MassMapIter {
265 map: self,
266 bucket_index: 0,
267 current_entries: Vec::new().into_iter(),
268 }
269 }
270
271 pub fn get_bucket(&self, index: usize) -> Result<Vec<(K, V)>> {
280 let bucket = &self.inner.bucket_metas[index];
281 if bucket.count == 0 {
282 return Ok(Vec::new());
283 }
284
285 self.inner
286 .reader
287 .read_exact_at(bucket.offset, bucket.length as u64, |data| {
288 let entries: Vec<(K, V)> = rmp_serde::from_slice(data).map_err(|e| {
289 Error::new(
290 ErrorKind::InvalidData,
291 format!("Failed to deserialize bucket entries: {}", e),
292 )
293 })?;
294 Ok(entries)
295 })
296 }
297
298 fn bucket_index<Q>(&self, k: &Q) -> usize
299 where
300 K: Borrow<Q>,
301 Q: Eq + Hash + ?Sized,
302 {
303 (self.inner.build_hasher.hash_one(k) % (self.inner.bucket_metas.len() as u64)) as usize
304 }
305}
306
307pub struct MassMapIter<'a, K, V, R: MassMapReader, H: MassMapHashLoader> {
313 map: &'a MassMap<K, V, R, H>,
314 bucket_index: usize,
315 current_entries: std::vec::IntoIter<(K, V)>,
316}
317
318impl<'a, K, V, R: MassMapReader, H: MassMapHashLoader> Iterator for MassMapIter<'a, K, V, R, H>
319where
320 K: for<'de> Deserialize<'de> + Eq + Hash,
321 V: for<'de> Deserialize<'de> + Clone,
322{
323 type Item = Result<(K, V)>;
324
325 fn next(&mut self) -> Option<Self::Item> {
326 loop {
327 if let Some(entry) = self.current_entries.next() {
329 return Some(Ok(entry));
330 }
331
332 if self.bucket_index >= self.map.inner.bucket_metas.len() {
334 return None;
335 }
336
337 let result = self.map.get_bucket(self.bucket_index);
339 self.bucket_index += 1;
340
341 match result {
342 Ok(entries) => {
343 let vec: Vec<(K, V)> = entries;
344 self.current_entries = vec.into_iter();
345 }
346 Err(e) => return Some(Err(e)),
347 }
348 }
349 }
350}
351
352#[cfg(test)]
353mod tests {
354 use crate::*;
355
356 #[test]
357 fn test_basic() {
358 let dir = tempfile::tempdir().unwrap();
359 let file = dir.path().join("massmap.bin");
360 let writer = std::fs::File::create(&file).unwrap();
361 let entries = vec![
362 ("apple", 1),
363 ("banana", 2),
364 ("cherry", 3),
365 ("date", 4),
366 ("elderberry", 5),
367 ];
368 let builder = MassMapBuilder::default()
369 .with_hash_seed(42)
370 .with_bucket_count(8)
371 .with_writer_buffer_size(8 << 20) .with_field_names(true);
373 let info = builder.build(&writer, entries.iter()).unwrap();
374 assert_eq!(info.meta.entry_count, 5);
375
376 let file = std::fs::File::open(&file).unwrap();
377 assert_eq!(
378 info.header.meta_length + info.header.meta_offset,
379 file.metadata().unwrap().len()
380 );
381 let map = MassMap::<String, i32, _>::load(file).unwrap();
382 assert_eq!(info, map.info());
383 assert_eq!(map.len(), 5);
384 assert!(!map.is_empty());
385 assert_eq!(map.bucket_count(), 8);
386 assert_eq!(
387 map.inner.bucket_metas.iter().map(|b| b.count).sum::<u32>(),
388 5
389 );
390 assert_eq!(map.get("apple").unwrap(), Some(1));
391 assert_eq!(map.get("banana").unwrap(), Some(2));
392 assert_eq!(map.get("steins").unwrap(), None);
393 assert_eq!(map.get("gate").unwrap(), None);
394
395 let keys = vec!["cherry", "date", "fig", "elderberry", "steins", "gate"];
396 let results = map.batch_get::<str>(keys).unwrap();
397 assert_eq!(results, vec![Some(3), Some(4), None, Some(5), None, None]);
398
399 let keys = ["cherry", "date", "fig", "elderberry", "steins", "gate"].map(|s| s.to_string());
400 let results = map.batch_get::<String>(&keys).unwrap();
401 assert_eq!(results, vec![Some(3), Some(4), None, Some(5), None, None]);
402 }
403
404 #[test]
405 fn test_1m() {
406 let dir = tempfile::tempdir().unwrap();
407 let file = dir.path().join("massmap.bin");
408 let writer = std::fs::File::create(&file).unwrap();
409 const N: u64 = 1_000_000;
410 let entries = (0..N).map(|i| (i, i));
411
412 let builder = MassMapBuilder::default()
413 .with_bucket_count(N as u64)
414 .with_writer_buffer_size(8 << 20); builder.build(&writer, entries).unwrap();
416
417 let file = std::fs::File::open(&file).unwrap();
418 println!("massmap file size: {}", file.metadata().unwrap().len());
419
420 let map = MassMap::<u64, u64, _>::load(file).unwrap();
421 assert_eq!(map.len(), N as u64);
422 assert_eq!(map.bucket_count(), N as usize);
423 assert_eq!(
424 map.inner
425 .bucket_metas
426 .iter()
427 .map(|b| b.count as usize)
428 .sum::<usize>(),
429 N as usize
430 );
431
432 for _ in 0..10 {
433 let k = rand::random::<u64>() % N as u64;
434 assert_eq!(map.get(&k).unwrap(), Some(k));
435
436 let k = k + N as u64;
437 assert_eq!(map.get(&k).unwrap(), None);
438 }
439 }
440
441 #[test]
442 fn test_invalid_data() {
443 let dir = tempfile::tempdir().unwrap();
444 let path = dir.path().join("massmap_invalid.bin");
445 let writer = std::fs::File::create(&path).unwrap();
446 const N: u64 = 1000;
447 let entries = (0..N).map(|i| (i, i));
448
449 let builder = MassMapBuilder::default()
450 .with_bucket_count(1)
451 .with_writer_buffer_size(8 << 20); let info = builder.build(&writer, entries).unwrap();
453
454 let file = std::fs::OpenOptions::new()
455 .read(true)
456 .write(true)
457 .open(&path)
458 .unwrap();
459
460 {
461 file.write_all_at(b"invalid data", 24).unwrap();
462 let file = std::fs::File::open(&path).unwrap();
463 let map = MassMap::<u64, u64, _>::load(file).unwrap();
464 map.get(&0).unwrap_err();
465 map.batch_get([0]).unwrap_err();
466 }
467
468 {
469 file.write_all_at(b"invalid data", info.header.meta_offset)
470 .unwrap();
471 let file = std::fs::File::open(&path).unwrap();
472 assert!(MassMap::<u64, u64, _>::load(file).is_err());
473 }
474
475 {
476 file.set_len(info.header.meta_offset + info.header.meta_length - 8)
477 .unwrap();
478 let file = std::fs::File::open(&path).unwrap();
479 assert!(MassMap::<u64, u64, _>::load(file).is_err());
480 }
481
482 {
483 file.write_all_at(b"invalid data", 0).unwrap();
484 let file = std::fs::File::open(&path).unwrap();
485 assert!(MassMap::<u64, u64, _>::load(file).is_err());
486 }
487
488 {
489 let file = std::fs::File::create(&path).unwrap();
490 assert!(MassMap::<u64, u64, _>::load(file).is_err());
491 }
492
493 let writer = std::fs::File::create(&path).unwrap();
494 let builder = MassMapBuilder::default()
495 .with_bucket_count(1)
496 .with_writer_buffer_size(8 << 20)
497 .with_bucket_size_limit(16);
498 let entries = (0..N).map(|i| (i, i));
499 builder.build(&writer, entries).unwrap_err();
500 }
501
502 #[test]
503 fn test_iterator_basic() {
504 let dir = tempfile::tempdir().unwrap();
505 let file = dir.path().join("massmap_iter.bin");
506 let writer = std::fs::File::create(&file).unwrap();
507 let entries = vec![
508 ("apple", 1),
509 ("banana", 2),
510 ("cherry", 3),
511 ("date", 4),
512 ("elderberry", 5),
513 ];
514 let builder = MassMapBuilder::default()
515 .with_hash_seed(42)
516 .with_bucket_count(8);
517 builder.build(&writer, entries.iter()).unwrap();
518
519 let file = std::fs::File::open(&file).unwrap();
520 let map = MassMap::<String, i32, _>::load(file).unwrap();
521
522 let mut collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
524 assert_eq!(collected.len(), 5);
525
526 collected.sort_by(|a, b| a.0.cmp(&b.0));
528 let mut expected = entries
529 .iter()
530 .map(|(k, v)| (k.to_string(), *v))
531 .collect::<Vec<_>>();
532 expected.sort_by(|a, b| a.0.cmp(&b.0));
533 assert_eq!(collected, expected);
534 }
535
536 #[test]
537 fn test_iterator_empty() {
538 let dir = tempfile::tempdir().unwrap();
539 let file = dir.path().join("massmap_iter_empty.bin");
540 let writer = std::fs::File::create(&file).unwrap();
541 let entries: Vec<(String, i32)> = vec![];
542 let builder = MassMapBuilder::default().with_bucket_count(8);
543 builder.build(&writer, entries.iter()).unwrap();
544
545 let file = std::fs::File::open(&file).unwrap();
546 let map = MassMap::<String, i32, _>::load(file).unwrap();
547
548 let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
549 assert_eq!(collected.len(), 0);
550 }
551
552 #[test]
553 fn test_iterator_single_bucket() {
554 let dir = tempfile::tempdir().unwrap();
555 let file = dir.path().join("massmap_iter_single.bin");
556 let writer = std::fs::File::create(&file).unwrap();
557 let entries = vec![("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5)];
558 let builder = MassMapBuilder::default().with_bucket_count(1);
560 builder.build(&writer, entries.iter()).unwrap();
561
562 let file = std::fs::File::open(&file).unwrap();
563 let map = MassMap::<String, i32, _>::load(file).unwrap();
564
565 let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
566 assert_eq!(collected.len(), 5);
567
568 let mut collected_sorted = collected.clone();
570 collected_sorted.sort_by(|a, b| a.0.cmp(&b.0));
571 let mut expected = entries
572 .iter()
573 .map(|(k, v)| (k.to_string(), *v))
574 .collect::<Vec<_>>();
575 expected.sort_by(|a, b| a.0.cmp(&b.0));
576 assert_eq!(collected_sorted, expected);
577 }
578
579 #[test]
580 fn test_iterator_many_buckets() {
581 let dir = tempfile::tempdir().unwrap();
582 let file = dir.path().join("massmap_iter_many.bin");
583 let writer = std::fs::File::create(&file).unwrap();
584 const N: u64 = 1000;
585 let entries = (0..N).map(|i| (i, i * 2));
586 let builder = MassMapBuilder::default().with_bucket_count(100);
588 builder.build(&writer, entries).unwrap();
589
590 let file = std::fs::File::open(&file).unwrap();
591 let map = MassMap::<u64, u64, _>::load(file).unwrap();
592
593 let collected: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
594 assert_eq!(collected.len(), N as usize);
595
596 let mut collected_sorted = collected.clone();
598 collected_sorted.sort_by(|a, b| a.0.cmp(&b.0));
599 for i in 0..N {
600 assert_eq!(collected_sorted[i as usize], (i, i * 2));
601 }
602 }
603
604 #[test]
605 fn test_iterator_multiple_iterations() {
606 let dir = tempfile::tempdir().unwrap();
607 let file = dir.path().join("massmap_iter_multiple.bin");
608 let writer = std::fs::File::create(&file).unwrap();
609 let entries = vec![("x", 10), ("y", 20), ("z", 30)];
610 let builder = MassMapBuilder::default().with_bucket_count(4);
611 builder.build(&writer, entries.iter()).unwrap();
612
613 let file = std::fs::File::open(&file).unwrap();
614 let map = MassMap::<String, i32, _>::load(file).unwrap();
615
616 let collected1: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
618 assert_eq!(collected1.len(), 3);
619
620 let collected2: Vec<_> = map.iter().collect::<std::io::Result<Vec<_>>>().unwrap();
622 assert_eq!(collected2.len(), 3);
623 assert_eq!(collected1, collected2);
624 }
625
626 #[test]
627 fn test_iterator_partial_iteration() {
628 let dir = tempfile::tempdir().unwrap();
629 let file = dir.path().join("massmap_iter_partial.bin");
630 let writer = std::fs::File::create(&file).unwrap();
631 let entries = (0..100).map(|i| (i, i));
632 let builder = MassMapBuilder::default().with_bucket_count(10);
633 builder.build(&writer, entries).unwrap();
634
635 let file = std::fs::File::open(&file).unwrap();
636 let map = MassMap::<u64, u64, _>::load(file).unwrap();
637
638 let partial: Vec<_> = map
640 .iter()
641 .take(10)
642 .collect::<std::io::Result<Vec<_>>>()
643 .unwrap();
644 assert_eq!(partial.len(), 10);
645
646 let skip_take: Vec<_> = map
648 .iter()
649 .skip(20)
650 .take(5)
651 .collect::<std::io::Result<Vec<_>>>()
652 .unwrap();
653 assert_eq!(skip_take.len(), 5);
654 }
655
656 #[test]
657 fn test_iterator_invalid_bucket() {
658 let dir = tempfile::tempdir().unwrap();
659 let path = dir.path().join("massmap_iter_invalid.bin");
660 let writer = std::fs::File::create(&path).unwrap();
661 let entries = (0..100).map(|i| (i, i));
662
663 let builder = MassMapBuilder::default()
664 .with_bucket_count(10)
665 .with_writer_buffer_size(8 << 20);
666 builder.build(&writer, entries).unwrap();
667
668 let file = std::fs::File::open(&path).unwrap();
671 let map = MassMap::<u64, u64, _>::load(file).unwrap();
672
673 for bucket in &map.inner.bucket_metas {
674 if bucket.offset != 24 && bucket.count > 0 {
675 let file = std::fs::OpenOptions::new()
677 .read(true)
678 .write(true)
679 .open(&path)
680 .unwrap();
681 file.write_all_at(b"corrupted bucket", bucket.offset)
682 .unwrap();
683 break;
684 }
685 }
686
687 let mut found_error = false;
689 for result in map.iter() {
690 if result.is_err() {
691 found_error = true;
692 break;
693 }
694 }
695 assert!(found_error);
696 }
697
698 #[test]
699 fn test_massmap_cast() {
700 let dir = tempfile::tempdir().unwrap();
701 let file = dir.path().join("massmap_cast.bin");
702 let writer = std::fs::File::create(&file).unwrap();
703 let entries = vec![
704 ("apple", 1),
705 ("banana", 2),
706 ("cherry", 3),
707 ("date", 4),
708 ("elderberry", 5),
709 ];
710 let builder = MassMapBuilder::default()
711 .with_hash_seed(42)
712 .with_bucket_count(8);
713 builder.build(&writer, entries.iter()).unwrap();
714
715 let file = std::fs::File::open(&file).unwrap();
716 let map = MassMapInner::<_>::load(file).unwrap();
717
718 let casted_map: MassMap<String, i64, _, _> = map.cast();
719 assert_eq!(casted_map.get("apple").unwrap(), Some(1i64));
720 assert_eq!(casted_map.get("banana").unwrap(), Some(2i64));
721 assert_eq!(casted_map.get("steins").unwrap(), None);
722 }
723}