xet-core-structures 1.5.2

Core data structures including MerkleHash, metadata shards, and Xorb objects.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
use std::collections::HashSet;
use std::io::{Cursor, Read, Write, copy};
use std::mem::size_of;
use std::time::SystemTime;

use bytes::Bytes;
use futures::AsyncRead;
use futures_util::io::AsyncReadExt;
use itertools::Itertools;
use more_asserts::debug_assert_lt;

use super::file_structs::{FileDataSequenceHeader, MDBFileInfoView};
use super::shard_file::{MDB_FILE_INFO_ENTRY_SIZE, current_timestamp};
use super::xorb_structs::{MDBXorbInfoView, XorbChunkSequenceEntry, XorbChunkSequenceHeader};
use super::{MDBShardFileFooter, MDBShardFileHeader};
use crate::MerkleHashMap;
use crate::error::{CoreError, Result};
use crate::merklehash::MerkleHash;

/// Walks the file info section of a shard, handing each file record to `file_callback`.
///
/// Every record starts with a `FileDataSequenceHeader`.  The header's `num_entries` and flags
/// determine how many fixed-size entries follow: one per segment, one more per segment when
/// verification is present, and a single extra entry when the metadata extension is present.
/// The header is re-serialized in front of those entries so that the view owns one contiguous
/// buffer.
///
/// The reader must be positioned at the start of the file info section.  Iteration stops at the
/// bookend header, leaving the reader at the end of the section.
///
/// # Errors
///
/// Propagates any error from header (de)serialization, from reading the record payload, from
/// building the view, or from `file_callback` itself.
pub fn process_shard_file_info_section<R: Read, FileFunc>(reader: &mut R, mut file_callback: FileFunc) -> Result<()>
where
    FileFunc: FnMut(MDBFileInfoView) -> Result<()>,
{
    let header_len = size_of::<FileDataSequenceHeader>();

    loop {
        let header = FileDataSequenceHeader::deserialize(reader)?;

        // The bookend header terminates the section.
        if header.is_bookend() {
            return Ok(());
        }

        let segment_count = header.num_entries as usize;
        let verification_count = if header.contains_verification() {
            segment_count
        } else {
            0
        };
        let metadata_ext_count = usize::from(header.contains_metadata_ext());

        let payload_len = (segment_count + verification_count + metadata_ext_count) * MDB_FILE_INFO_ENTRY_SIZE;

        // Header first, then up to `payload_len` bytes straight from the reader.
        let mut record = Vec::with_capacity(header_len + payload_len);
        header.serialize(&mut record)?;
        copy(&mut reader.take(payload_len as u64), &mut record)?;

        let view = MDBFileInfoView::from_data_and_header(header, Bytes::from(record))?;
        file_callback(view)?;
    }
}

/// Walks the xorb info section of a shard, handing each xorb record to `xorb_callback`.
///
/// Every record is an `XorbChunkSequenceHeader` followed by `num_entries` chunk entries of
/// `size_of::<XorbChunkSequenceEntry>()` bytes each.  The header is re-serialized in front of the
/// entries so that the view owns one contiguous buffer.
///
/// The reader must be positioned at the start of the xorb info section.  Iteration stops at the
/// bookend header, leaving the reader at the end of the section.
///
/// # Errors
///
/// Propagates any error from header (de)serialization, from reading the record payload, from
/// building the view, or from `xorb_callback` itself.
pub fn process_shard_xorb_info_section<R: Read, XorbFunc>(reader: &mut R, mut xorb_callback: XorbFunc) -> Result<()>
where
    XorbFunc: FnMut(MDBXorbInfoView) -> Result<()>,
{
    let header_len = size_of::<XorbChunkSequenceHeader>();
    let entry_len = size_of::<XorbChunkSequenceEntry>();

    loop {
        let header = XorbChunkSequenceHeader::deserialize(reader)?;

        // The bookend header terminates the section.
        if header.is_bookend() {
            return Ok(());
        }

        let payload_len = (header.num_entries as usize) * entry_len;

        // Header first, then up to `payload_len` bytes straight from the reader.
        let mut record = Vec::with_capacity(header_len + payload_len);
        header.serialize(&mut record)?;
        copy(&mut reader.take(payload_len as u64), &mut record)?;

        let view = MDBXorbInfoView::from_data_and_header(header, Bytes::from(record))?;
        xorb_callback(view)?;
    }
}

// Async versions of the above

/// Async variant of the file info section walk: reads file records from an `AsyncRead` source
/// and hands each assembled view to `file_callback`.
///
/// For every record the raw header bytes are read first and decoded; unless the header is the
/// bookend, the payload size is derived from `num_entries` and the verification / metadata
/// extension flags, and exactly that many bytes are read behind a copy of the header bytes.
///
/// # Errors
///
/// Fails if the reader cannot supply the requested bytes (`read_exact`), if the header or view
/// cannot be decoded, or if `file_callback` returns an error.
pub async fn process_shard_file_info_section_async<R: AsyncRead + Unpin, FileFunc>(
    reader: &mut R,
    mut file_callback: FileFunc,
) -> Result<()>
where
    FileFunc: FnMut(MDBFileInfoView) -> Result<()>,
{
    const HEADER_LEN: usize = size_of::<FileDataSequenceHeader>();

    loop {
        // Pull in the raw header bytes, then decode them.
        let mut header_bytes = [0u8; HEADER_LEN];
        reader.read_exact(&mut header_bytes).await?;

        let header = FileDataSequenceHeader::deserialize(&mut Cursor::new(&header_bytes[..]))?;

        // The bookend header terminates the section.
        if header.is_bookend() {
            return Ok(());
        }

        let segment_count = header.num_entries as usize;
        let verification_count = if header.contains_verification() {
            segment_count
        } else {
            0
        };
        let metadata_ext_count = usize::from(header.contains_metadata_ext());

        let payload_len = (segment_count + verification_count + metadata_ext_count) * MDB_FILE_INFO_ENTRY_SIZE;

        // One zero-filled buffer for the whole record: the header bytes go in front and the
        // payload is read directly into the tail.
        let mut record = vec![0u8; HEADER_LEN + payload_len];
        let (header_part, payload_part) = record.split_at_mut(HEADER_LEN);
        header_part.copy_from_slice(&header_bytes);
        reader.read_exact(payload_part).await?;

        let view = MDBFileInfoView::from_data_and_header(header, Bytes::from(record))?;
        file_callback(view)?;
    }
}

/// Async variant of the xorb info section walk: reads xorb records from an `AsyncRead` source
/// and hands each assembled view to `xorb_callback`.
///
/// For every record the raw header bytes are read first and decoded; unless the header is the
/// bookend, `num_entries * size_of::<XorbChunkSequenceEntry>()` payload bytes are read behind a
/// copy of the header bytes.
///
/// # Errors
///
/// Fails if the reader cannot supply the requested bytes (`read_exact`), if the header or view
/// cannot be decoded, or if `xorb_callback` returns an error.
pub async fn process_shard_xorb_info_section_async<R: AsyncRead + Unpin, XorbFunc>(
    reader: &mut R,
    mut xorb_callback: XorbFunc,
) -> Result<()>
where
    XorbFunc: FnMut(MDBXorbInfoView) -> Result<()>,
{
    const HEADER_LEN: usize = size_of::<XorbChunkSequenceHeader>();

    loop {
        // Pull in the raw header bytes, then decode them.
        let mut header_bytes = [0u8; HEADER_LEN];
        reader.read_exact(&mut header_bytes).await?;

        let header = XorbChunkSequenceHeader::deserialize(&mut Cursor::new(&header_bytes[..]))?;

        // The bookend header terminates the section.
        if header.is_bookend() {
            return Ok(());
        }

        let payload_len = (header.num_entries as usize) * size_of::<XorbChunkSequenceEntry>();

        // One zero-filled buffer for the whole record: the header bytes go in front and the
        // chunk entries are read directly into the tail.
        let mut record = vec![0u8; HEADER_LEN + payload_len];
        let (header_part, payload_part) = record.split_at_mut(HEADER_LEN);
        header_part.copy_from_slice(&header_bytes);
        reader.read_exact(payload_part).await?;

        let view = MDBXorbInfoView::from_data_and_header(header, Bytes::from(record))?;
        xorb_callback(view)?;
    }
}

/// A minimal shard held entirely in memory: just the file info views and xorb info views that
/// were read from a serialized shard, kept in the order they were read.
///
/// It is usable on its own, and it also provides a testing surface for the section iteration
/// routines defined earlier in this file.
#[derive(Clone, Debug, PartialEq)]
pub struct MDBMinimalShard {
    // One view per file record collected by the loader (empty when files were not requested).
    file_info_views: Vec<MDBFileInfoView>,
    // One view per xorb record collected by the loader (empty when xorbs were not requested).
    xorb_info_views: Vec<MDBXorbInfoView>,
}

impl MDBMinimalShard {
    pub fn from_reader<R: Read>(reader: &mut R, include_files: bool, include_xorb: bool) -> Result<Self> {
        // Check the header; not needed except for version verification.
        let _ = MDBShardFileHeader::deserialize(reader)?;

        let mut file_info_views = Vec::<MDBFileInfoView>::new();
        process_shard_file_info_section(reader, |fiv: MDBFileInfoView| {
            // register the offset here to the file entries
            if include_files {
                file_info_views.push(fiv);
            }
            Ok(())
        })?;

        let mut xorb_info_views = Vec::<MDBXorbInfoView>::new();
        if include_xorb {
            process_shard_xorb_info_section(reader, |civ: MDBXorbInfoView| {
                xorb_info_views.push(civ);
                Ok(())
            })?;
        }

        Ok(Self {
            file_info_views,
            xorb_info_views,
        })
    }

    pub async fn from_reader_async<R: AsyncRead + Unpin>(
        reader: &mut R,
        include_files: bool,
        include_xorb: bool,
    ) -> Result<Self> {
        Self::from_reader_async_with_custom_callbacks(reader, include_files, include_xorb, |_| Ok(()), |_| Ok(())).await
    }

    /// Loads a minimal shard from an async reader, letting the caller observe each record as it
    /// is collected.
    ///
    /// * `include_files` - keep the file info views.  `file_callback` is invoked (by reference)
    ///   for each file view just before it is stored; when this flag is `false` the file section
    ///   is still consumed but neither the callback nor the storage happens.
    /// * `include_xorb` - read the xorb section, invoking `xorb_callback` (by reference) for each
    ///   xorb view before storing it; when `false` the xorb section is not read.
    ///
    /// # Errors
    ///
    /// Returns an error if the header or a section fails to read or parse, if a callback returns
    /// an error, or if only some of the collected files carry verification data.
    pub async fn from_reader_async_with_custom_callbacks<R: AsyncRead + Unpin, FileFunc, XorbFunc>(
        reader: &mut R,
        include_files: bool,
        include_xorb: bool,
        mut file_callback: FileFunc,
        mut xorb_callback: XorbFunc,
    ) -> Result<Self>
    where
        FileFunc: FnMut(&MDBFileInfoView) -> Result<()>,
        XorbFunc: FnMut(&MDBXorbInfoView) -> Result<()>,
    {
        // The shard header is decoded only so that an invalid one is rejected; its contents are
        // not used afterwards.
        let mut shard_header_bytes = [0u8; size_of::<MDBShardFileHeader>()];
        reader.read_exact(&mut shard_header_bytes).await?;
        let _ = MDBShardFileHeader::deserialize(&mut Cursor::new(&shard_header_bytes[..]))?;

        // File section.
        let mut file_info_views: Vec<MDBFileInfoView> = Vec::new();
        process_shard_file_info_section_async(reader, |view: MDBFileInfoView| {
            if include_files {
                file_callback(&view)?;
                file_info_views.push(view);
            }
            Ok(())
        })
        .await?;

        // A shard is only valid when every file has verification or none does.
        let verification_is_uniform = file_info_views
            .iter()
            .map(|view| view.contains_verification())
            .all_equal();
        if !(file_info_views.is_empty() || verification_is_uniform) {
            return Err(CoreError::invalid_shard("only some files contain verification"));
        }

        // Xorb section, read only on request.
        let mut xorb_info_views: Vec<MDBXorbInfoView> = Vec::new();
        if include_xorb {
            process_shard_xorb_info_section_async(reader, |view: MDBXorbInfoView| {
                xorb_callback(&view)?;
                xorb_info_views.push(view);
                Ok(())
            })
            .await?;
        }

        Ok(Self {
            file_info_views,
            xorb_info_views,
        })
    }

    /// Reports whether the first stored file view carries verification data.
    ///
    /// Returns `false` when the shard holds no file views.  Only the first view is inspected.
    pub fn has_file_verification(&self) -> bool {
        self.file_info_views
            .first()
            .is_some_and(|first_view| first_view.contains_verification())
    }

    /// Number of file info views held by this shard.
    pub fn num_files(&self) -> usize {
        self.file_info_views.len()
    }

    /// Returns the file info view at `index`, or `None` if `index` is out of range.
    pub fn file(&self, index: usize) -> Option<&MDBFileInfoView> {
        self.file_info_views.get(index)
    }

    /// Number of xorb info views held by this shard.
    pub fn num_xorb(&self) -> usize {
        self.xorb_info_views.len()
    }

    /// Returns the xorb info view at `index`, or `None` if `index` is out of range.
    pub fn xorb(&self, index: usize) -> Option<&MDBXorbInfoView> {
        self.xorb_info_views.get(index)
    }

    /// Computes the number of bytes a full serialization of this shard occupies: shard header,
    /// every file view, the file section bookend, every xorb view, the xorb section bookend, and
    /// the footer.
    ///
    /// Returns 0 if `with_verification` is true but the shard has no verification information.
    pub fn serialized_size(&self, with_verification: bool) -> usize {
        if with_verification && !self.has_file_verification() {
            return 0;
        }

        let file_section_bytes: usize = self
            .file_info_views
            .iter()
            .map(|view| view.byte_size(with_verification))
            .sum();
        let xorb_section_bytes: usize = self.xorb_info_views.iter().map(|view| view.byte_size()).sum();

        size_of::<MDBShardFileHeader>()
            + file_section_bytes
            + size_of::<FileDataSequenceHeader>() // bookend of file section
            + xorb_section_bytes
            + size_of::<XorbChunkSequenceHeader>() // bookend for xorb info section
            + size_of::<MDBShardFileFooter>()
    }

    /// Builds a lookup from xorb hash to the chunk indices at which some file in this shard
    /// starts, taken from the first entry of every non-empty file view.  These are the chunks
    /// that are useful for global dedup.
    ///
    /// Each index list is returned sorted and without duplicates.
    fn file_start_entries(&self) -> MerkleHashMap<Vec<usize>> {
        let mut starts_by_xorb = MerkleHashMap::<Vec<usize>>::new();

        // Only files with at least one entry have a starting chunk.
        let non_empty_files = self.file_info_views.iter().filter(|view| view.num_entries() > 0);
        for view in non_empty_files {
            let first_segment = view.entry(0);
            starts_by_xorb
                .entry(first_segment.xorb_hash)
                .or_default()
                .push(first_segment.chunk_index_start as usize);
        }

        // Normalize every list so callers can binary-search it.
        for indices in starts_by_xorb.values_mut() {
            indices.sort_unstable();
            indices.dedup();
        }

        starts_by_xorb
    }

    /// Shared implementation behind the public `serialize*` methods.
    ///
    /// Writes, in order: a default shard header, the file views (only if `with_file_section`),
    /// the file section bookend, every xorb view accepted by `xorb_filter_fn`, the xorb section
    /// bookend, and a footer whose lookup tables are empty.
    ///
    /// * `with_verification` - forwarded to each file view's serializer.
    /// * `expiry` - stored in the footer as seconds since the Unix epoch; `None` (or a time
    ///   before the epoch) is stored as 0.
    /// * `xorb_filter_fn` - returns `true` for the xorbs that should be written.
    ///
    /// Chunks that start a file in this shard get the global dedup flag set while their xorb is
    /// written; this lookup is built from the file views even when the file section itself is
    /// not written.
    ///
    /// Returns the total number of bytes written.
    fn serialize_impl<W: Write>(
        &self,
        writer: &mut W,
        with_file_section: bool,
        with_verification: bool,
        expiry: Option<SystemTime>,
        xorb_filter_fn: impl Fn(&MDBXorbInfoView) -> bool,
    ) -> Result<usize> {
        // Running byte count; its value at each section boundary is that section's offset.
        let mut written = MDBShardFileHeader::default().serialize(writer)?;

        // Chunk indices, per xorb hash, that sit at the start of a file.
        let file_start_chunks = self.file_start_entries();

        // ---- File section -------------------------------------------------------------------
        let file_info_offset = written as u64;
        let mut materialized_bytes = 0u64;

        if with_file_section {
            for file_view in &self.file_info_views {
                materialized_bytes += (0..file_view.num_entries())
                    .map(|seg_idx| file_view.entry(seg_idx).unpacked_segment_bytes as u64)
                    .sum::<u64>();
                written += file_view.serialize(writer, with_verification)?;
            }
        }
        written += FileDataSequenceHeader::bookend().serialize(writer)?;

        // ---- Xorb section -------------------------------------------------------------------
        let xorb_info_offset = written as u64;
        let mut stored_bytes_on_disk = 0u64;
        let mut stored_bytes = 0u64;

        // Xorbs rejected by the filter contribute neither bytes nor statistics.
        for xorb_view in self.xorb_info_views.iter().filter(|&view| xorb_filter_fn(view)) {
            stored_bytes_on_disk += xorb_view.header().num_bytes_on_disk as u64;
            stored_bytes += xorb_view.header().num_bytes_in_xorb as u64;

            written += match file_start_chunks.get(&xorb_view.xorb_hash()) {
                Some(gde_indices) => {
                    debug_assert!(gde_indices.is_sorted());

                    // Flag the file-start chunks; every other chunk is written untouched, so a
                    // flag that is already present is carried over.
                    xorb_view.serialize_with_chunk_rewrite(writer, |idx, chunk| {
                        if gde_indices.binary_search(&idx).is_ok() {
                            chunk.with_global_dedup_flag(true)
                        } else {
                            chunk
                        }
                    })?
                },
                None => xorb_view.serialize(writer)?,
            };
        }
        written += XorbChunkSequenceHeader::bookend().serialize(writer)?;

        // ---- Footer -------------------------------------------------------------------------
        // No lookup tables are emitted, so all of their offsets coincide with the footer start.
        let footer_offset = written as u64;
        let shard_creation_timestamp = current_timestamp();
        let shard_key_expiry =
            expiry.map_or(0, |t| t.duration_since(std::time::UNIX_EPOCH).unwrap_or_default().as_secs());

        let footer = MDBShardFileFooter {
            file_info_offset,
            xorb_info_offset,
            file_lookup_offset: footer_offset,
            file_lookup_num_entry: 0,
            xorb_lookup_offset: footer_offset,
            xorb_lookup_num_entry: 0,
            chunk_lookup_offset: footer_offset,
            chunk_lookup_num_entry: 0,
            shard_creation_timestamp,
            shard_key_expiry,
            stored_bytes_on_disk,
            materialized_bytes,
            stored_bytes,
            footer_offset,
            ..Default::default()
        };
        written += footer.serialize(writer)?;

        Ok(written)
    }

    /// Writes a shard that contains no file information and only the xorbs for which
    /// `xorb_filter_fn` returns `true`.  Global deduplication chunk information is preserved.
    /// No expiration is recorded in the footer.
    ///
    /// Returns the number of bytes written.
    pub fn serialize_xorb_subset_only<W: Write>(
        &self,
        writer: &mut W,
        xorb_filter_fn: impl Fn(&MDBXorbInfoView) -> bool,
    ) -> Result<usize> {
        // Same output as the expiry variant with no expiration requested.
        self.serialize_xorb_subset_with_expiry(writer, None, xorb_filter_fn)
    }

    /// Writes a shard that contains no file information and only the xorbs for which
    /// `xorb_filter_fn` returns `true`, recording `expiry` in the footer.
    /// Pass `None` for no expiration.
    ///
    /// Returns the number of bytes written.
    pub fn serialize_xorb_subset_with_expiry<W: Write>(
        &self,
        writer: &mut W,
        expiry: Option<SystemTime>,
        xorb_filter_fn: impl Fn(&MDBXorbInfoView) -> bool,
    ) -> Result<usize> {
        let with_file_section = false;
        let with_verification = false;

        self.serialize_impl(writer, with_file_section, with_verification, expiry, xorb_filter_fn)
    }

    /// Serialize out the given shard, sanitizing and updating the global dedup chunk flags and optionally
    /// dropping the file verification section.
    pub fn serialize<W: Write>(&self, writer: &mut W, with_verification: bool) -> Result<usize> {
        self.serialize_impl(writer, true, with_verification, None, |_| true)
    }

    /// Collects the hashes of all chunks in this shard that are eligible for global dedup.
    ///
    /// A chunk qualifies when either
    ///   - it is the first chunk referenced by a file in this shard, or
    ///   - its xorb entry reports `is_global_dedup_eligible()`.
    ///
    /// Each hash appears once; the order of the returned list is unspecified.
    pub fn global_dedup_eligible_chunks(&self) -> Vec<MerkleHash> {
        // Xorb hash -> chunk indices that start a file, so the file-based rule can be applied
        // while scanning the xorb definitions.
        let file_start_entries = self.file_start_entries();

        let mut eligible = HashSet::<MerkleHash>::new();

        for xorb_view in &self.xorb_info_views {
            let num_entries = xorb_view.num_entries();

            // Rule 1: chunks at the start of a file.
            let file_starts = file_start_entries.get(&xorb_view.xorb_hash()).into_iter().flatten();
            for &c_idx in file_starts {
                debug_assert_lt!(c_idx, num_entries);

                // An out-of-range index is an error caught by verification elsewhere; skip it
                // here rather than crash in production.
                if c_idx < num_entries {
                    eligible.insert(xorb_view.chunk(c_idx).chunk_hash);
                }
            }

            // Rule 2: chunks whose own entry marks them as eligible.
            let flagged_chunks = (0..num_entries)
                .map(|c_idx| xorb_view.chunk(c_idx))
                .filter(|chunk| chunk.is_global_dedup_eligible());
            for chunk in flagged_chunks {
                eligible.insert(chunk.chunk_hash);
            }
        }

        eligible.into_iter().collect()
    }
}

#[cfg(test)]
mod tests {
    use std::collections::{HashMap, HashSet};
    use std::io::Cursor;
    use std::time::{Duration, SystemTime};

    use rand::rngs::SmallRng;
    use rand::{RngExt, SeedableRng};

    use super::super::MDBShardInfo;
    use super::super::file_structs::MDBFileInfo;
    use super::super::shard_file::test_routines::{
        convert_to_file, gen_random_shard, gen_random_shard_with_xorb_references,
    };
    use super::super::shard_in_memory::MDBInMemoryShard;
    use super::super::xorb_structs::MDBXorbInfo;
    use super::MDBMinimalShard;
    use crate::error::Result;
    use crate::merklehash::MerkleHash;

    /// Serializes `min_shard` with and without verification and checks the result against
    /// `mem_shard`: the reported size, the reloaded file records, and the reloaded xorb records
    /// (ignoring the global dedup flag, which reserialization may set).
    fn verify_serialization(min_shard: &MDBMinimalShard, mem_shard: &MDBInMemoryShard) -> Result<()> {
        for verification in [true, false] {
            // compute size, with verification if possible only
            let size = min_shard.serialized_size(min_shard.has_file_verification() && verification);
            assert_ne!(0, size);

            // if lacking verification, assert that getting the size with verification returns 0
            if !min_shard.has_file_verification() {
                assert_eq!(0, min_shard.serialized_size(true))
            }

            // Now verify that the serialized version is the same too.
            let mut reloaded_shard = Vec::new();
            let serialize_result = min_shard.serialize(&mut reloaded_shard, verification);
            if !min_shard.has_file_verification() && verification && min_shard.num_files() > 0 {
                assert!(serialize_result.is_err());
                continue;
            }
            assert!(serialize_result.is_ok());
            let serialized_len = serialize_result?;
            assert_eq!(reloaded_shard.len(), serialized_len);
            assert_eq!(size, serialized_len);

            let si = MDBShardInfo::load_from_reader(&mut Cursor::new(&reloaded_shard)).unwrap();

            let file_info: Vec<MDBFileInfo> =
                si.read_all_file_info_sections(&mut Cursor::new(&reloaded_shard)).unwrap();
            let mem_file_info: Vec<_> = mem_shard.file_content.clone().into_values().collect();

            // `zip` stops at the shorter side, so a missing or extra file record would otherwise
            // go unnoticed.
            assert_eq!(file_info.len(), mem_file_info.len(), "verification = {verification}");

            for (i, (read, mem)) in file_info.iter().zip(mem_file_info.iter()).enumerate() {
                assert!(read.equal_accepting_no_verification(mem), "i: {i} verification = {verification}");
            }

            let xorb_info: Vec<MDBXorbInfo> = si.read_all_xorb_blocks_full(&mut Cursor::new(&reloaded_shard)).unwrap();
            let mem_xorb_info: Vec<_> = mem_shard.xorb_content.clone().into_values().collect();

            assert_eq!(xorb_info.len(), mem_xorb_info.len(), "verification = {verification}");

            // Test for equality while ignoring the global dedup flag, as this gets modified on reserializing.
            for i in 0..xorb_info.len() {
                let c1 = &xorb_info[i];
                let c2 = mem_xorb_info[i].as_ref();

                assert_eq!(c1.metadata, c2.metadata);

                // Same reasoning as for the files: make sure the chunk-by-chunk comparison below
                // covers every chunk on both sides.
                assert_eq!(c1.chunks.len(), c2.chunks.len(), "i: {i} verification = {verification}");

                for (ch1, ch2) in c1.chunks.iter().zip(c2.chunks.iter()) {
                    // Clear the global dedup one on the new serialized version, as it may have been set.
                    let ch1 = ch1.clone().with_global_dedup_flag(false);
                    assert_eq!(&ch1, ch2);
                }
            }
        }

        Ok(())
    }

    /// Round-trips `mem_shard` through its serialized form and checks that the sync and async
    /// minimal-shard loaders agree for every combination of included sections, that the view to
    /// owned-struct conversions are consistent, and that reserialization matches the source.
    async fn verify_minimal_shard(mem_shard: &MDBInMemoryShard) -> Result<()> {
        let buffer = convert_to_file(mem_shard)?;

        // Files and xorbs both included.
        {
            let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), true, true).unwrap();
            let min_shard_async = MDBMinimalShard::from_reader_async(&mut &buffer[..], true, true).await.unwrap();

            assert_eq!(min_shard, min_shard_async);

            // Verify From trait implementations for views
            for i in 0..min_shard.num_files() {
                let file_view = min_shard.file(i).unwrap();
                let file_info = MDBFileInfo::from(file_view);
                assert_eq!(file_info.metadata.file_hash, file_view.file_hash());
                assert_eq!(file_info.segments.len(), file_view.num_entries());
                assert_eq!(file_info.contains_verification(), file_view.contains_verification());
                assert_eq!(file_info.contains_metadata_ext(), file_view.contains_metadata_ext());
            }

            for i in 0..min_shard.num_xorb() {
                let xorb_view = min_shard.xorb(i).unwrap();
                let xorb_info = MDBXorbInfo::from(xorb_view);
                assert_eq!(xorb_info.metadata.xorb_hash, xorb_view.xorb_hash());
                assert_eq!(xorb_info.chunks.len(), xorb_view.num_entries());
            }

            verify_serialization(&min_shard, mem_shard).unwrap();
        }

        {
            // Test we're good on the ones without xorb entries.
            let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), true, false).unwrap();
            let min_shard_async = MDBMinimalShard::from_reader_async(&mut &buffer[..], true, false).await.unwrap();

            assert_eq!(min_shard, min_shard_async);

            let mut file_only_memshard = mem_shard.clone();
            file_only_memshard.xorb_content.clear();
            file_only_memshard.chunk_hash_lookup.clear();

            verify_serialization(&min_shard, &file_only_memshard).unwrap();
        }

        // Test we're good on the ones without file entries.
        {
            let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), false, true).unwrap();
            let min_shard_async = MDBMinimalShard::from_reader_async(&mut &buffer[..], false, true).await.unwrap();

            assert_eq!(min_shard, min_shard_async);

            let mut xorb_only_memshard = mem_shard.clone();
            xorb_only_memshard.file_content.clear();

            verify_serialization(&min_shard, &xorb_only_memshard).unwrap();
        }

        // Test custom callbacks
        {
            let mut file_info_views = vec![];
            let mut xorb_info_views = vec![];

            let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), true, true).unwrap();
            let min_shard_async = MDBMinimalShard::from_reader_async_with_custom_callbacks(
                &mut &buffer[..],
                true,
                true,
                |f| {
                    file_info_views.push(f.clone());
                    Ok(())
                },
                |c| {
                    xorb_info_views.push(c.clone());
                    Ok(())
                },
            )
            .await
            .unwrap();

            assert_eq!(min_shard, min_shard_async);

            // The callbacks must have seen exactly the views that ended up in the shard.
            assert_eq!(file_info_views, min_shard.file_info_views);
            assert_eq!(xorb_info_views, min_shard.xorb_info_views);

            // Both sections were loaded, so the full in-memory shard is the reference.
            verify_serialization(&min_shard, mem_shard).unwrap();
        }

        Ok(())
    }

    /// Runs the minimal-shard checks (including sync/async agreement) over randomly generated
    /// shards of increasing size, toggling the generator's two trailing flags through all four
    /// combinations for the largest configuration.
    #[tokio::test]
    async fn test_shards() -> Result<()> {
        // Smallest configurations first.
        verify_minimal_shard(&gen_random_shard(0, &[], &[0], false, false)?).await?;
        verify_minimal_shard(&gen_random_shard(0, &[1], &[1, 1], false, false)?).await?;

        // Larger shard, every flag combination.
        verify_minimal_shard(&gen_random_shard(0, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], false, false)?).await?;
        verify_minimal_shard(&gen_random_shard(0, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], true, false)?).await?;
        verify_minimal_shard(&gen_random_shard(0, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], false, true)?).await?;
        verify_minimal_shard(&gen_random_shard(0, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], true, true)?).await?;

        Ok(())
    }

    /// Runs the basic minimal-shard checks, then verifies the xorb-only export paths: the set of
    /// global dedup eligible chunks must survive dropping the file section, and filtering xorbs
    /// must keep exactly the expected xorbs and eligible chunks.
    async fn verify_minimal_shard_dedup_processing(mem_shard: &MDBInMemoryShard) {
        verify_minimal_shard(mem_shard).await.unwrap();

        // Reference: the full shard and its eligible chunk set.
        let shard_bytes = convert_to_file(mem_shard).unwrap();
        let full_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&shard_bytes), true, true).unwrap();
        let expected_eligible: HashSet<_> = full_shard.global_dedup_eligible_chunks().into_iter().collect();

        // Export every xorb but no file info, reload, and compare the eligible sets.
        let mut xorb_only_bytes = Vec::<u8>::new();
        full_shard
            .serialize_xorb_subset_only(&mut xorb_only_bytes, |_| true)
            .unwrap();
        let xorb_only_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&xorb_only_bytes), true, true).unwrap();
        let xorb_only_eligible: HashSet<_> = xorb_only_shard.global_dedup_eligible_chunks().into_iter().collect();

        assert_eq!(expected_eligible, xorb_only_eligible);

        // Filtering test.  Every xorb is assigned a random group in 0..=3; a filter then keeps
        // the xorbs whose group is below a threshold.
        //
        // The test data allows the same chunk to occur in several xorbs, so for each eligible
        // chunk we record the groups of all xorbs containing it.
        let mut rng = SmallRng::seed_from_u64(0);
        let mut group_of_xorb = HashMap::<MerkleHash, usize>::new();
        let mut groups_of_chunk = HashMap::<MerkleHash, Vec<usize>>::new();

        for xorb in (0..full_shard.num_xorb()).map(|xi| full_shard.xorb(xi).unwrap()) {
            let group = rng.random_range(0..=3);
            group_of_xorb.insert(xorb.xorb_hash(), group);

            let eligible_hashes = (0..xorb.num_entries())
                .map(|ci| xorb.chunk(ci).chunk_hash)
                .filter(|hash| expected_eligible.contains(hash));
            for hash in eligible_hashes {
                groups_of_chunk.entry(hash).or_default().push(group);
            }
        }

        for group_threshold in 1..4 {
            let keeps_xorb = |xorb_hash: MerkleHash| *group_of_xorb.get(&xorb_hash).unwrap() < group_threshold;

            // What the filtered shard is expected to contain.
            let expected_xorbs: HashSet<MerkleHash> =
                group_of_xorb.keys().copied().filter(|&hash| keeps_xorb(hash)).collect();

            let expected_chunks: HashSet<_> = groups_of_chunk
                .iter()
                .filter_map(|(&hash, groups)| groups.iter().any(|&group| group < group_threshold).then_some(hash))
                .collect();

            // Export the subset and reload it.
            let mut subset_bytes = Vec::<u8>::new();
            full_shard
                .serialize_xorb_subset_only(&mut subset_bytes, |xorb| keeps_xorb(xorb.xorb_hash()))
                .unwrap();
            let subset_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&subset_bytes), true, true).unwrap();

            assert_eq!(subset_shard.num_files(), 0);
            assert_eq!(subset_shard.num_xorb(), expected_xorbs.len());

            let present_xorbs: HashSet<_> = (0..subset_shard.num_xorb())
                .map(|i| subset_shard.xorb(i).unwrap().xorb_hash())
                .collect();
            assert_eq!(present_xorbs, expected_xorbs);

            let subset_eligible: HashSet<_> = subset_shard.global_dedup_eligible_chunks().into_iter().collect();
            assert_eq!(expected_chunks, subset_eligible);
        }
    }

    /// Exercises the xorb-subset export and global dedup checks over randomly generated shards
    /// whose files reference their xorbs, toggling the generator's two trailing flags through
    /// all four combinations for the largest configuration.
    #[tokio::test]
    async fn test_shard_processing() {
        // Smallest configurations first.
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[1], &[1], false, false).unwrap(),
        )
        .await;
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[2], &[1, 1], false, false).unwrap(),
        )
        .await;

        // Larger shard, every flag combination.
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], false, false).unwrap(),
        )
        .await;
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], true, false).unwrap(),
        )
        .await;
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], false, true).unwrap(),
        )
        .await;
        verify_minimal_shard_dedup_processing(
            &gen_random_shard_with_xorb_references(1, &[1, 5, 10, 8], &[4, 3, 5, 9, 4, 6], true, true).unwrap(),
        )
        .await;
    }

    /// Checks the `shard_key_expiry` footer field written by `serialize_xorb_subset_with_expiry`:
    /// 0 when no expiry is given, otherwise the expiry in seconds since the Unix epoch.
    #[test]
    fn test_serialize_xorb_subset_with_expiry_footer() {
        let mem_shard = gen_random_shard_with_xorb_references(1, &[1, 2], &[3, 2], true, true).unwrap();
        let shard_bytes = convert_to_file(&mem_shard).unwrap();
        let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&shard_bytes), true, true).unwrap();

        // Serializes every xorb with the given expiry, reloads the result, and returns the expiry
        // recorded in its footer.
        let stored_expiry = |expiry: Option<SystemTime>| {
            let mut out = Vec::<u8>::new();
            min_shard
                .serialize_xorb_subset_with_expiry(&mut out, expiry, |_| true)
                .unwrap();
            MDBShardInfo::load_from_reader(&mut Cursor::new(&out))
                .unwrap()
                .metadata
                .shard_key_expiry
        };

        // No expiry requested: the footer holds 0.
        assert_eq!(stored_expiry(None), 0);

        // An expiry in the future is stored verbatim, in seconds.
        let expiry_secs = super::current_timestamp().saturating_add(12345);
        let expiry = SystemTime::UNIX_EPOCH + Duration::from_secs(expiry_secs);
        let written_expiry = stored_expiry(Some(expiry));

        assert_eq!(written_expiry, expiry_secs);
        assert!(written_expiry > super::current_timestamp());
    }
}