// jam-rs 0.9.10
//
// Just another (genomic) minhash (Jam) implementation in Rust.
// Documentation
// Refuse to build on any target that is not little-endian; the message below
// is what the compiler reports in that case.
// NOTE(review): presumably this is because the `Pod` structs in this file are
// reinterpreted directly from stored bytes without byte-swapping — confirm.
#[cfg(not(target_endian = "little"))]
compile_error!("JAM format requires a little-endian platform");

use bytemuck::{Pod, Zeroable};

/// Magic bytes that `Header::validate` requires in `Header::magic`:
/// the ASCII letters `JAM` followed by a NUL byte.
pub const MAGIC: [u8; 4] = [b'J', b'A', b'M', b'\0'];
/// The only format version that `Header::validate` accepts.
pub const VERSION: u32 = 3;

/// Page granularity, in bytes, that [`align_to_page`] rounds offsets up to.
pub const PAGE_SIZE: usize = 4096;

// The mask-based rounding in `align_to_page` is only correct when the page
// size is a power of two, so enforce that at compile time.
const _: () = assert!(PAGE_SIZE.is_power_of_two());

/// Rounds `offset` up to the next multiple of [`PAGE_SIZE`].
///
/// An offset that is already a multiple of [`PAGE_SIZE`] is returned unchanged.
///
/// # Panics
///
/// Panics if the rounded-up value does not fit in a `usize`. Without this
/// check the addition would wrap in builds that have overflow checks disabled
/// and the function would silently return `0`. When called in a const
/// context the overflow is reported as a compile-time error instead.
#[inline]
pub const fn align_to_page(offset: usize) -> usize {
    match offset.checked_add(PAGE_SIZE - 1) {
        // Adding `PAGE_SIZE - 1` and clearing the low bits rounds up.
        Some(bumped) => bumped & !(PAGE_SIZE - 1),
        None => panic!("align_to_page: page-aligned offset does not fit in usize"),
    }
}
/// Number of hash buckets; [`bucket_id`] always returns a value below this.
pub const BUCKET_COUNT: usize = 256;
/// Number of low hash bits that select a bucket (`BUCKET_COUNT == 1 << BUCKET_BITS`).
pub const BUCKET_BITS: u8 = 8;
/// Size in bytes of one entry record; `Header::validate` requires
/// `Header::entry_size` to equal this.
pub const ENTRY_SIZE: usize = 12;
/// Size in bytes of the header.
pub const HEADER_SIZE: usize = 160;
/// Size in bytes of one bucket-table record.
pub const BUCKET_META_SIZE: usize = 32;
/// Size in bytes of the whole bucket table (one record per bucket).
pub const BUCKET_TABLE_SIZE: usize = BUCKET_COUNT * BUCKET_META_SIZE;
/// Combined size of the header and the bucket table.
/// NOTE(review): presumably the first byte offset available after them — confirm.
pub const DATA_START: usize = HEADER_SIZE + BUCKET_TABLE_SIZE;

// `bucket_id` masks with `BUCKET_COUNT - 1`, which only selects exactly
// `BUCKET_BITS` low bits when the two constants agree.
const _: () = assert!(BUCKET_COUNT == 1usize << (BUCKET_BITS as u32));

/// Maps a hash to its bucket index using the low [`BUCKET_BITS`] bits.
///
/// The result is always in `0..BUCKET_COUNT`. The mask is derived from
/// [`BUCKET_COUNT`] rather than spelled as a literal so the function cannot
/// drift out of sync with the constants above.
#[inline(always)]
pub fn bucket_id(hash: u64) -> usize {
    (hash & (BUCKET_COUNT as u64 - 1)) as usize
}

/// Fixed-layout header record.
///
/// The struct is `#[repr(C)]` and derives `Pod`/`Zeroable`, so its in-memory
/// layout is its byte layout; the assertion after the struct pins the size to
/// 160 bytes. Do not reorder, resize, or insert fields without updating the
/// format version.
///
/// Only `magic`, `version`, `bucket_count`, `entry_size` and `hash_threshold`
/// are checked by [`Header::validate`]; the remaining fields are not validated
/// in this file.
#[repr(C)]
#[derive(Debug, Clone, Copy, Pod, Zeroable)]
pub struct Header {
    /// Must equal [`MAGIC`] for `validate` to succeed.
    pub magic: [u8; 4],
    /// Must equal [`VERSION`] for `validate` to succeed.
    pub version: u32,
    /// Flag word. NOTE(review): presumably a bit set built from the `FLAG_*`
    /// constants such as [`FLAG_HAS_BIAS_TABLE`] — confirm against the writer.
    pub flags: u64,

    // Counts. NOTE(review): exact meaning of each count is not established in
    // this file — confirm against the writer.
    pub entry_count: u64,
    pub unique_hash_count: u64,
    pub sample_count: u32,
    /// Must equal [`BUCKET_COUNT`] for `validate` to succeed.
    pub bucket_count: u16,
    /// Not checked by `validate`.
    pub bucket_bits: u8,
    /// Must equal [`ENTRY_SIZE`] for `validate` to succeed.
    pub entry_size: u8,

    /// Must be non-zero for `validate` to succeed.
    pub hash_threshold: u64,
    pub kmer_size: u8,
    /// Reserved bytes; together with `kmer_size` they occupy 8 bytes, keeping
    /// the following `u64` fields aligned without implicit padding.
    pub _param_reserved: [u8; 7],

    // Section offsets. NOTE(review): presumably byte offsets from the start
    // of the file — confirm.
    pub bucket_table_offset: u64,
    pub entries_offset: u64,
    pub filters_offset: u64,
    pub bias_table_offset: u64,

    // Section sizes. NOTE(review): presumably in bytes — confirm.
    pub entries_size: u64,
    pub filters_size: u64,
    pub bias_table_size: u64,

    // Sample metadata sections. NOTE(review): presumably byte offset/size
    // pairs — confirm.
    pub sample_names_offset: u64,
    pub sample_names_size: u64,
    pub sample_sizes_offset: u64,
    pub sample_sizes_size: u64,

    /// Trailing bytes that bring the struct to 160 bytes.
    pub _padding: [u8; 16],
}

/// Bit 0 of a `u64` flag word.
/// NOTE(review): presumably tested against `Header::flags` to indicate that a
/// bias table is present — confirm.
pub const FLAG_HAS_BIAS_TABLE: u64 = 1 << 0;

// Compile-time guard: the header layout must stay exactly 160 bytes.
const _: () = assert!(std::mem::size_of::<Header>() == 160);

impl Header {
    /// Checks the header's fixed fields against the constants in this file.
    ///
    /// The checks run in a fixed order — magic, version, bucket count, entry
    /// size, hash threshold — and the first one that fails determines the
    /// error that is returned.
    ///
    /// # Errors
    ///
    /// * [`FormatError::InvalidMagic`] if `magic` differs from [`MAGIC`].
    /// * [`FormatError::UnsupportedVersion`] if `version` differs from [`VERSION`].
    /// * [`FormatError::InvalidBucketCount`] if `bucket_count` differs from [`BUCKET_COUNT`].
    /// * [`FormatError::InvalidEntrySize`] if `entry_size` differs from [`ENTRY_SIZE`].
    /// * [`FormatError::InvalidHashThreshold`] if `hash_threshold` is zero.
    pub fn validate(&self) -> Result<(), FormatError> {
        // Arms are tried top to bottom, which preserves the check order.
        match *self {
            Self { magic, .. } if magic != MAGIC => Err(FormatError::InvalidMagic(magic)),
            Self { version, .. } if version != VERSION => {
                Err(FormatError::UnsupportedVersion(version))
            }
            Self { bucket_count, .. } if bucket_count != BUCKET_COUNT as u16 => {
                Err(FormatError::InvalidBucketCount(bucket_count))
            }
            Self { entry_size, .. } if entry_size != ENTRY_SIZE as u8 => {
                Err(FormatError::InvalidEntrySize(entry_size))
            }
            Self {
                hash_threshold: 0, ..
            } => Err(FormatError::InvalidHashThreshold),
            _ => Ok(()),
        }
    }
}

/// Per-bucket record made of four `u64` fields (32 bytes, asserted below).
///
/// `#[repr(C)]` plus `Pod`/`Zeroable` fix the byte layout; `Default` yields an
/// all-zero record.
/// NOTE(review): presumably one of these is stored per bucket in the bucket
/// table, and the offsets/sizes are in bytes — confirm against the writer.
#[repr(C)]
#[derive(Debug, Clone, Copy, Pod, Zeroable, Default)]
pub struct BucketMeta {
    // NOTE(review): presumably where this bucket's entries start — confirm units/base.
    pub entry_offset: u64,
    // NOTE(review): presumably the number of entries in this bucket — confirm.
    pub entry_count: u64,
    // NOTE(review): presumably where this bucket's filter starts — confirm units/base.
    pub filter_offset: u64,
    // NOTE(review): presumably the size of this bucket's filter — confirm units.
    pub filter_size: u64,
}

// Compile-time guard: the record layout must stay exactly 32 bytes.
const _: () = assert!(std::mem::size_of::<BucketMeta>() == 32);

/// A `(hash, sample_id)` pair stored without padding.
///
/// `#[repr(C, packed)]` removes the trailing padding a `u64` + `u32` struct
/// would otherwise get, so the size is 12 bytes (asserted below). Because the
/// struct is packed, fields may be unaligned: copy them out by value rather
/// than borrowing them.
///
/// The derived ordering compares `hash` first and then `sample_id`, following
/// field declaration order.
#[repr(C, packed)]
#[derive(Debug, Clone, Copy, Pod, Zeroable, PartialEq, Eq, PartialOrd, Ord)]
pub struct Entry {
    pub hash: u64,
    pub sample_id: u32,
}

// Compile-time guard: the packed layout must stay exactly 12 bytes.
const _: () = assert!(std::mem::size_of::<Entry>() == 12);

impl Entry {
    #[inline]
    pub fn new(hash: u64, sample_id: u32) -> Self {
        Self { hash, sample_id }
    }

    #[inline]
    pub fn bucket_id(&self) -> usize {
        bucket_id(self.hash)
    }
}

/// Errors returned by `Header::validate`.
///
/// Each variant that carries a payload holds the offending value read from
/// the header, which is also interpolated into the display message.
#[derive(Debug, thiserror::Error)]
pub enum FormatError {
    /// `Header::magic` did not equal `MAGIC`; carries the bytes found.
    #[error("Invalid magic bytes: {0:?}")]
    InvalidMagic([u8; 4]),

    /// `Header::version` did not equal `VERSION`; carries the version found.
    #[error("Unsupported version: {0}")]
    UnsupportedVersion(u32),

    /// `Header::bucket_count` did not equal `BUCKET_COUNT`; carries the count found.
    #[error("Invalid bucket count: {0}")]
    InvalidBucketCount(u16),

    /// `Header::entry_size` did not equal `ENTRY_SIZE`; carries the size found.
    #[error("Invalid entry size: {0}")]
    InvalidEntrySize(u8),

    /// `Header::hash_threshold` was zero.
    #[error("Invalid hash threshold: must be > 0")]
    InvalidHashThreshold,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a header that passes `Header::validate`; individual tests then
    /// override exactly one field to trigger the error under test.
    fn valid_header() -> Header {
        Header {
            magic: MAGIC,
            version: VERSION,
            bucket_count: BUCKET_COUNT as u16,
            entry_size: ENTRY_SIZE as u8,
            hash_threshold: u64::MAX, // any non-zero threshold is accepted
            ..Header::zeroed()
        }
    }

    #[test]
    fn test_struct_sizes() {
        assert_eq!(std::mem::size_of::<Header>(), 160);
        assert_eq!(std::mem::size_of::<BucketMeta>(), 32);
        assert_eq!(std::mem::size_of::<Entry>(), 12);
    }

    /// The size constants must agree with the structs they describe.
    #[test]
    fn test_layout_constants_match_structs() {
        assert_eq!(HEADER_SIZE, std::mem::size_of::<Header>());
        assert_eq!(BUCKET_META_SIZE, std::mem::size_of::<BucketMeta>());
        assert_eq!(ENTRY_SIZE, std::mem::size_of::<Entry>());
        assert_eq!(BUCKET_COUNT, 1usize << (BUCKET_BITS as u32));
        assert_eq!(BUCKET_TABLE_SIZE, BUCKET_COUNT * BUCKET_META_SIZE);
        assert_eq!(DATA_START, HEADER_SIZE + BUCKET_TABLE_SIZE);
    }

    #[test]
    fn test_align_to_page() {
        assert_eq!(align_to_page(0), 0);
        assert_eq!(align_to_page(1), PAGE_SIZE);
        assert_eq!(align_to_page(PAGE_SIZE - 1), PAGE_SIZE);
        assert_eq!(align_to_page(PAGE_SIZE), PAGE_SIZE);
        assert_eq!(align_to_page(PAGE_SIZE + 1), 2 * PAGE_SIZE);
    }

    #[test]
    fn test_bucket_id() {
        assert_eq!(bucket_id(0x0000_0000_0000_0000), 0);
        assert_eq!(bucket_id(0x0000_0000_0000_00FF), 255);
        assert_eq!(bucket_id(0xFFFF_FFFF_FFFF_FF00), 0);
        assert_eq!(bucket_id(0xABCD_EF12_3456_7842), 0x42);
    }

    #[test]
    fn test_entry_ordering() {
        let e1 = Entry::new(100, 1);
        let e2 = Entry::new(100, 2);
        let e3 = Entry::new(200, 1);

        // Ordering is by hash first, then by sample id.
        assert!(e1 < e2);
        assert!(e2 < e3);
        assert!(e1 < e3);
    }

    #[test]
    fn test_bucket_id_distribution() {
        let threshold: u64 = (u64::MAX as f64 * 0.001) as u64;
        let mut bucket_counts = [0usize; 256];

        for i in 0..100_000u64 {
            let hash = i.wrapping_mul(0x517cc1b727220a95) % threshold;
            bucket_counts[bucket_id(hash)] += 1;
        }

        let avg = 100_000 / 256;
        for (i, &count) in bucket_counts.iter().enumerate() {
            let deviation = (count as f64 - avg as f64).abs() / avg as f64;
            assert!(deviation < 0.3, "Bucket {} has skewed count: {}", i, count);
        }
    }

    #[test]
    fn test_header_validate_valid() {
        assert!(valid_header().validate().is_ok());
    }

    #[test]
    fn test_header_validate_zero_threshold() {
        let header = Header {
            hash_threshold: 0, // invalid
            ..valid_header()
        };
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidHashThreshold)
        ));
    }

    #[test]
    fn test_header_validate_bad_magic() {
        let header = Header {
            magic: *b"BAD\0",
            ..valid_header()
        };
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidMagic(found)) if found == *b"BAD\0"
        ));
    }

    #[test]
    fn test_header_validate_bad_version() {
        let header = Header {
            version: 99,
            ..valid_header()
        };
        assert!(matches!(
            header.validate(),
            Err(FormatError::UnsupportedVersion(99))
        ));
    }

    #[test]
    fn test_header_validate_bad_bucket_count() {
        let header = Header {
            bucket_count: 128,
            ..valid_header()
        };
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidBucketCount(128))
        ));
    }

    #[test]
    fn test_header_validate_bad_entry_size() {
        let header = Header {
            entry_size: 16,
            ..valid_header()
        };
        assert!(matches!(
            header.validate(),
            Err(FormatError::InvalidEntrySize(16))
        ));
    }

    /// An all-zero header fails on the first check (magic), not a later one.
    #[test]
    fn test_header_validate_reports_first_failure() {
        assert!(matches!(
            Header::zeroed().validate(),
            Err(FormatError::InvalidMagic([0, 0, 0, 0]))
        ));
    }

    #[test]
    fn test_entry_bucket_id() {
        let entry = Entry::new(0xABCD_EF12_3456_7842, 5);
        assert_eq!(entry.bucket_id(), 0x42);
    }
}