use dryice::{DryIceError, RecordKey};
/// Builds the 16-byte type tag for a packed key that is `n` bytes wide.
///
/// The tag is the ASCII text `spill:seq2b:` followed by `n` written as four
/// zero-padded decimal digits. Only the four least-significant decimal digits
/// of `n` are kept, so `n` and `n + 10_000` yield the same tag.
// Every value that is narrowed is a single decimal digit (0..=9), so the cast is lossless.
#[allow(clippy::cast_possible_truncation)]
const fn make_tag(n: usize) -> [u8; 16] {
    let mut tag = *b"spill:seq2b:0000";
    let mut rest = n;
    let mut slot = 16;
    // Fill the four digit slots right to left; `while` because `for` is not usable in `const fn`.
    while slot > 12 {
        slot -= 1;
        tag[slot] = b'0' + (rest % 10) as u8;
        rest /= 10;
    }
    tag
}
/// Fixed-width key holding a nucleotide sequence packed at two bits per base.
///
/// The wrapped array is `N` bytes long. [`from_sequence`](Self::from_sequence)
/// stores four bases per byte with the earliest base in the most significant
/// bits, so the derived ordering compares keys base by base from the start of
/// the sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PackedSequenceKey<const N: usize>(pub [u8; N]);

impl<const N: usize> PackedSequenceKey<N> {
    /// Number of bases that fit in a key: four per byte.
    pub const BASES: usize = N * 4;

    /// Packs the leading bases of `sequence` into a key.
    ///
    /// Base codes are `A = 00`, `C = 01`, `G = 10`, `T = 11`, matched
    /// case-insensitively; every other byte (for example `N`) is packed as
    /// `00`, the same as `A`. Bases beyond [`Self::BASES`] are ignored, and a
    /// sequence shorter than that leaves the remaining bits zero.
    #[must_use]
    pub fn from_sequence(sequence: &[u8]) -> Self {
        let mut packed = [0u8; N];
        // Pair each output byte with the four bases it holds. `zip` stops once
        // the `N` output bytes are used up, which drops any bases past `BASES`.
        for (slot, quad) in packed.iter_mut().zip(sequence.chunks(4)) {
            // The first base of the group lands in bits 7..6, the next in 5..4, and so on.
            let mut shift = 8u32;
            for &base in quad {
                shift -= 2;
                let code: u8 = match base.to_ascii_uppercase() {
                    b'C' => 0b01,
                    b'G' => 0b10,
                    b'T' => 0b11,
                    _ => 0b00,
                };
                *slot |= code << shift;
            }
        }
        Self(packed)
    }
}
/// Encodes a [`PackedSequenceKey`] as its `N` packed bytes, copied verbatim.
// The `as u16` cast in `WIDTH` is guarded by a compile-time range check.
#[allow(clippy::cast_possible_truncation)]
impl<const N: usize> RecordKey for PackedSequenceKey<N> {
    /// Encoded width in bytes, equal to `N`.
    ///
    /// Using this constant with `N > u16::MAX` fails const evaluation rather
    /// than silently truncating to a width that no longer matches the `N`
    /// bytes that `encode_into` and `decode_from` require.
    const WIDTH: u16 = {
        assert!(
            N <= u16::MAX as usize,
            "PackedSequenceKey width does not fit in RecordKey::WIDTH (u16)"
        );
        N as u16
    };

    /// `spill:seq2b:` followed by the key width as four decimal digits.
    const TYPE_TAG: [u8; 16] = make_tag(N);

    /// Copies the packed bytes into `out`.
    ///
    /// # Panics
    ///
    /// Panics if `out` is not exactly `N` bytes long (`copy_from_slice`
    /// requires equal lengths; debug builds report the mismatch first).
    fn encode_into(&self, out: &mut [u8]) {
        debug_assert_eq!(out.len(), N);
        out.copy_from_slice(&self.0);
    }

    /// Rebuilds a key from exactly `N` bytes.
    ///
    /// # Errors
    ///
    /// Returns [`DryIceError::InvalidRecordKeyEncoding`] when `bytes` is not
    /// exactly `N` bytes long.
    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError> {
        let packed: [u8; N] =
            bytes
                .try_into()
                .map_err(|_| DryIceError::InvalidRecordKeyEncoding {
                    message: "packed sequence key length mismatch",
                })?;
        Ok(Self(packed))
    }
}
/// 38-byte key; `BASES` is 152, which is enough to hold a 150-base read in full.
pub type IlluminaKey = PackedSequenceKey<38>;
/// 64-byte key; `BASES` is 256.
pub type PairedEndKey = PackedSequenceKey<64>;
/// 128-byte key; `BASES` is 512, so only the first 512 bases of a longer
/// sequence are packed by `from_sequence`.
pub type LongReadPrefixKey = PackedSequenceKey<128>;
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for packing `seq` into an `N`-byte key.
    fn pack<const N: usize>(seq: &[u8]) -> PackedSequenceKey<N> {
        PackedSequenceKey::<N>::from_sequence(seq)
    }

    #[test]
    fn pack_simple_sequence() {
        // A=00, C=01, G=10, T=11 packs to 0b0001_1011 for each group of four bases.
        let PackedSequenceKey(bytes) = pack::<2>(b"ACGTACGT");
        assert_eq!(bytes, [0x1B, 0x1B]);
    }

    #[test]
    fn pack_preserves_lexicographic_order() {
        let ascending = [
            pack::<2>(b"AAAAAAAA"),
            pack::<2>(b"CCCCCCCC"),
            pack::<2>(b"GGGGGGGG"),
            pack::<2>(b"TTTTTTTT"),
        ];
        // Each neighbouring pair must be strictly increasing: A < C < G < T.
        for pair in ascending.windows(2) {
            assert!(pair[0] < pair[1]);
        }
    }

    #[test]
    fn pack_short_sequence_zero_pads() {
        // "AC" fills only the top four bits of the first byte; the rest stays zero.
        let PackedSequenceKey(bytes) = pack::<4>(b"AC");
        assert_eq!(bytes, [0x10, 0, 0, 0]);
    }

    #[test]
    fn short_sequence_sorts_before_longer_with_same_prefix() {
        let prefix_only = pack::<4>(b"AC");
        let extended = pack::<4>(b"ACGTACGTACGTACGT");
        assert!(
            extended > prefix_only,
            "zero-padded short sequence should sort before longer one"
        );
    }

    #[test]
    fn pack_handles_lowercase() {
        assert_eq!(
            pack::<2>(b"ACGTACGT"),
            pack::<2>(b"acgtacgt"),
            "case should not affect packing"
        );
    }

    #[test]
    fn pack_maps_ambiguous_to_a() {
        assert_eq!(
            pack::<1>(b"NCGT"),
            pack::<1>(b"ACGT"),
            "ambiguous bases should map to A in the packed key"
        );
    }

    #[test]
    fn record_key_round_trips() {
        let original = pack::<8>(b"ACGTACGTACGTACGTACGTACGTACGTACGT");
        let mut encoded = [0u8; 8];
        original.encode_into(&mut encoded);
        let restored =
            PackedSequenceKey::<8>::decode_from(&encoded).expect("decode should succeed");
        assert_eq!(original, restored);
    }

    #[test]
    fn type_tag_encodes_width() {
        assert_eq!(&IlluminaKey::TYPE_TAG, b"spill:seq2b:0038");
        assert_eq!(&PairedEndKey::TYPE_TAG, b"spill:seq2b:0064");
        assert_eq!(&LongReadPrefixKey::TYPE_TAG, b"spill:seq2b:0128");
    }

    #[test]
    fn illumina_key_covers_150bp() {
        assert_eq!(
            PackedSequenceKey::<38>::BASES,
            152,
            "38 bytes × 4 = 152 bases"
        );
    }

    #[test]
    fn paired_end_key_covers_256bp() {
        assert_eq!(PackedSequenceKey::<64>::BASES, 256);
    }

    #[test]
    fn long_read_prefix_key_covers_512bp() {
        assert_eq!(PackedSequenceKey::<128>::BASES, 512);
    }
}