// Source file: jam_rs/format.rs

// Refuse to build on big-endian targets. `target_endian` is either "little"
// or "big", so this is the same condition as `not(target_endian = "little")`.
#[cfg(target_endian = "big")]
compile_error!("JAM format requires a little-endian platform");
3
4use bytemuck::{Pod, Zeroable};
5
/// Four-byte file signature: ASCII `JAM` followed by a NUL byte.
pub const MAGIC: [u8; 4] = [b'J', b'A', b'M', 0];
/// The only format version accepted by header validation.
pub const VERSION: u32 = 3;
8
/// Page granularity, in bytes, used by [`align_to_page`].
pub const PAGE_SIZE: usize = 4096;

// `align_to_page` rounds with a bit mask, which is only correct when the page
// size is a power of two; fail the build if that invariant is ever broken.
const _: () = assert!(PAGE_SIZE.is_power_of_two());

/// Rounds `offset` up to the nearest multiple of [`PAGE_SIZE`].
///
/// An `offset` that is already page-aligned (including `0`) is returned
/// unchanged.
///
/// The internal addition overflows only when the rounded result itself does
/// not fit in a `usize` (that is, `offset > usize::MAX - (PAGE_SIZE - 1)`).
/// In that case builds with overflow checks enabled panic and other builds
/// wrap around.
#[inline]
pub const fn align_to_page(offset: usize) -> usize {
    // Add the precomputed mask rather than `PAGE_SIZE` and then subtracting
    // one: the intermediate sum then never exceeds the final result's page,
    // so the largest page-aligned offset no longer overflows spuriously.
    const MASK: usize = PAGE_SIZE - 1;
    (offset + MASK) & !MASK
}
/// Number of hash buckets. [`bucket_id`] selects a bucket with a bit mask, so
/// this must remain a power of two (enforced below via `BUCKET_BITS`).
pub const BUCKET_COUNT: usize = 256;
/// Number of low hash bits that select a bucket; `BUCKET_COUNT == 1 << BUCKET_BITS`.
pub const BUCKET_BITS: u8 = 8;
/// Size in bytes of one entry record.
pub const ENTRY_SIZE: usize = 12;
/// Size in bytes of the file header.
pub const HEADER_SIZE: usize = 160;
/// Size in bytes of one bucket metadata record.
pub const BUCKET_META_SIZE: usize = 32;
/// Size in bytes of the whole bucket table: one metadata record per bucket.
pub const BUCKET_TABLE_SIZE: usize = BUCKET_COUNT * BUCKET_META_SIZE;
/// Combined size of the header and the bucket table.
// NOTE(review): presumably the offset at which the data sections start when the
// bucket table directly follows the header; confirm against the writer.
pub const DATA_START: usize = HEADER_SIZE + BUCKET_TABLE_SIZE;

// Keep the two bucket parameters in lock-step; this also guarantees that
// `BUCKET_COUNT` is a power of two, which the mask in `bucket_id` relies on.
const _: () = assert!(BUCKET_COUNT == 1 << BUCKET_BITS);
// The header stores these values in narrower integer fields and validation
// compares them through `as u16` / `as u8` casts; make sure those casts can
// never truncate.
const _: () = assert!(BUCKET_COUNT <= u16::MAX as usize);
const _: () = assert!(ENTRY_SIZE <= u8::MAX as usize);

/// Maps a hash to its bucket index by keeping the low `BUCKET_BITS` bits.
///
/// The result is always in `0..BUCKET_COUNT`. The function is `const`, so it
/// can also be evaluated at compile time.
#[inline(always)]
pub const fn bucket_id(hash: u64) -> usize {
    // Derived from `BUCKET_COUNT` so the mask cannot drift from the constant.
    const MASK: u64 = (BUCKET_COUNT - 1) as u64;
    (hash & MASK) as usize
}
27
28#[repr(C)]
29#[derive(Debug, Clone, Copy, Pod, Zeroable)]
30pub struct Header {
31    pub magic: [u8; 4],
32    pub version: u32,
33    pub flags: u64,
34
35    pub entry_count: u64,
36    pub unique_hash_count: u64,
37    pub sample_count: u32,
38    pub bucket_count: u16,
39    pub bucket_bits: u8,
40    pub entry_size: u8,
41
42    pub hash_threshold: u64,
43    pub kmer_size: u8,
44    pub _param_reserved: [u8; 7],
45
46    pub bucket_table_offset: u64,
47    pub entries_offset: u64,
48    pub filters_offset: u64,
49    pub bias_table_offset: u64,
50
51    pub entries_size: u64,
52    pub filters_size: u64,
53    pub bias_table_size: u64,
54
55    pub sample_names_offset: u64,
56    pub sample_names_size: u64,
57    pub sample_sizes_offset: u64,
58    pub sample_sizes_size: u64,
59
60    pub _padding: [u8; 16],
61}
62
63pub const FLAG_HAS_BIAS_TABLE: u64 = 1 << 0;
64
65const _: () = assert!(std::mem::size_of::<Header>() == 160);
66
67impl Header {
68    pub fn validate(&self) -> Result<(), FormatError> {
69        if self.magic != MAGIC {
70            return Err(FormatError::InvalidMagic(self.magic));
71        }
72        if self.version != VERSION {
73            return Err(FormatError::UnsupportedVersion(self.version));
74        }
75        if self.bucket_count != BUCKET_COUNT as u16 {
76            return Err(FormatError::InvalidBucketCount(self.bucket_count));
77        }
78        if self.entry_size != ENTRY_SIZE as u8 {
79            return Err(FormatError::InvalidEntrySize(self.entry_size));
80        }
81        if self.hash_threshold == 0 {
82            return Err(FormatError::InvalidHashThreshold);
83        }
84        Ok(())
85    }
86}
87
88#[repr(C)]
89#[derive(Debug, Clone, Copy, Pod, Zeroable, Default)]
90pub struct BucketMeta {
91    pub entry_offset: u64,
92    pub entry_count: u64,
93    pub filter_offset: u64,
94    pub filter_size: u64,
95}
96
97const _: () = assert!(std::mem::size_of::<BucketMeta>() == 32);
98
99#[repr(C, packed)]
100#[derive(Debug, Clone, Copy, Pod, Zeroable, PartialEq, Eq, PartialOrd, Ord)]
101pub struct Entry {
102    pub hash: u64,
103    pub sample_id: u32,
104}
105
106const _: () = assert!(std::mem::size_of::<Entry>() == 12);
107
108impl Entry {
109    #[inline]
110    pub fn new(hash: u64, sample_id: u32) -> Self {
111        Self { hash, sample_id }
112    }
113
114    #[inline]
115    pub fn bucket_id(&self) -> usize {
116        bucket_id(self.hash)
117    }
118}
119
120#[derive(Debug, thiserror::Error)]
121pub enum FormatError {
122    #[error("Invalid magic bytes: {0:?}")]
123    InvalidMagic([u8; 4]),
124
125    #[error("Unsupported version: {0}")]
126    UnsupportedVersion(u32),
127
128    #[error("Invalid bucket count: {0}")]
129    InvalidBucketCount(u16),
130
131    #[error("Invalid entry size: {0}")]
132    InvalidEntrySize(u8),
133
134    #[error("Invalid hash threshold: must be > 0")]
135    InvalidHashThreshold,
136}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a header that passes every check in `Header::validate`.
    ///
    /// Each validation test starts from this value and breaks exactly one
    /// field, so a failure points at the field under test.
    fn valid_header() -> Header {
        Header {
            magic: MAGIC,
            version: VERSION,
            bucket_count: BUCKET_COUNT as u16,
            entry_size: ENTRY_SIZE as u8,
            hash_threshold: u64::MAX, // any non-zero threshold is accepted
            ..Header::zeroed()
        }
    }

    #[test]
    fn test_struct_sizes() {
        assert_eq!(std::mem::size_of::<Header>(), 160);
        assert_eq!(std::mem::size_of::<BucketMeta>(), 32);
        assert_eq!(std::mem::size_of::<Entry>(), 12);
    }

    #[test]
    fn test_bucket_id() {
        assert_eq!(bucket_id(0x0000_0000_0000_0000), 0);
        assert_eq!(bucket_id(0x0000_0000_0000_00FF), 255);
        assert_eq!(bucket_id(0xFFFF_FFFF_FFFF_FF00), 0);
        assert_eq!(bucket_id(0xABCD_EF12_3456_7842), 0x42);
    }

    #[test]
    fn test_entry_ordering() {
        let e1 = Entry::new(100, 1);
        let e2 = Entry::new(100, 2);
        let e3 = Entry::new(200, 1);

        // Ordered by hash first, then by sample id.
        assert!(e1 < e2);
        assert!(e2 < e3);
        assert!(e1 < e3);
    }

    #[test]
    fn test_bucket_id_distribution() {
        // Hashes are reduced below a threshold of 0.1% of the u64 range before
        // bucketing; the low bits must still spread evenly over all buckets.
        let threshold: u64 = (u64::MAX as f64 * 0.001) as u64;
        let mut bucket_counts = [0usize; 256];

        for i in 0..100_000u64 {
            let hash = i.wrapping_mul(0x517cc1b727220a95) % threshold;
            bucket_counts[bucket_id(hash)] += 1;
        }

        let avg = 100_000 / 256;
        for (i, &count) in bucket_counts.iter().enumerate() {
            let deviation = (count as f64 - avg as f64).abs() / avg as f64;
            assert!(deviation < 0.3, "Bucket {} has skewed count: {}", i, count);
        }
    }

    #[test]
    fn test_header_validate_valid() {
        assert!(valid_header().validate().is_ok());
    }

    #[test]
    fn test_header_validate_zero_threshold() {
        let mut header = valid_header();
        header.hash_threshold = 0; // invalid
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidHashThreshold)
        ));
    }

    #[test]
    fn test_header_validate_bad_magic() {
        let mut header = valid_header();
        header.magic = *b"BAD\0";
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidMagic(_))
        ));
    }

    #[test]
    fn test_header_validate_bad_version() {
        let mut header = valid_header();
        header.version = 99;
        assert!(matches!(
            header.validate(),
            Err(FormatError::UnsupportedVersion(99))
        ));
    }

    #[test]
    fn test_header_validate_bad_bucket_count() {
        let mut header = valid_header();
        header.bucket_count = 0;
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidBucketCount(0))
        ));
    }

    #[test]
    fn test_header_validate_bad_entry_size() {
        let mut header = valid_header();
        header.entry_size = 16;
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidEntrySize(16))
        ));
    }

    #[test]
    fn test_header_validate_reports_first_failure() {
        // Magic is checked before version, so a header that is wrong in both
        // must be reported as a magic failure.
        let mut header = valid_header();
        header.magic = *b"BAD\0";
        header.version = 99;
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidMagic(_))
        ));
    }

    #[test]
    fn test_entry_bucket_id() {
        let entry = Entry::new(0xABCD_EF12_3456_7842, 5);
        assert_eq!(entry.bucket_id(), 0x42);
    }
}