use crate::Error;
#[inline(always)]
pub fn as_4bit(seq: &[u8]) -> Result<u64, Error> {
if seq.len() > 16 {
return Err(Error::SequenceTooLong(seq.len()));
}
let mut packed = 0u64;
for (i, &base) in seq.iter().enumerate() {
let bits = match base {
b'A' | b'a' => 0b0000,
b'C' | b'c' => 0b0001,
b'G' | b'g' => 0b0010,
b'T' | b't' => 0b0011,
b'N' | b'n' => 0b1111, b'R' | b'r' => 0b1111, b'Y' | b'y' => 0b1111, b'S' | b's' => 0b1111, b'W' | b'w' => 0b1111, b'K' | b'k' => 0b1111, b'M' | b'm' => 0b1111, b'B' | b'b' => 0b1111, b'D' | b'd' => 0b1111, b'H' | b'h' => 0b1111, b'V' | b'v' => 0b1111, invalid => return Err(Error::InvalidBase(invalid)),
};
packed |= (bits as u64) << (i * 4);
}
Ok(packed)
}
pub fn encode_internal(sequence: &[u8], ebuf: &mut Vec<u64>) -> Result<(), Error> {
ebuf.clear();
let n_chunks = sequence.len().div_ceil(16);
let mut l_bounds = 0;
for _ in 0..n_chunks - 1 {
let r_bounds = l_bounds + 16;
let chunk = &sequence[l_bounds..r_bounds];
let bits = as_4bit(chunk)?;
ebuf.push(bits);
l_bounds = r_bounds;
}
let bits = as_4bit(&sequence[l_bounds..])?;
ebuf.push(bits);
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_as_4bit_basic() {
let tests: Vec<(&'static [u8], u64)> = vec![
(b"A", 0b0000),
(b"C", 0b0001),
(b"G", 0b0010),
(b"T", 0b0011),
(b"ACGT", 0b0011001000010000), ];
for (input, expected) in tests {
assert_eq!(as_4bit(input).unwrap(), expected);
}
}
#[test]
fn test_as_4bit_ambiguous() {
let ambiguous_bases = [
b'N', b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V',
];
for &base in &ambiguous_bases {
let result = as_4bit(&[base]).unwrap();
assert_eq!(
result, 0b1111,
"Base {} should encode to 0b1111",
base as char
);
}
}
#[test]
fn test_as_4bit_case_insensitive() {
assert_eq!(as_4bit(b"acgt").unwrap(), as_4bit(b"ACGT").unwrap());
assert_eq!(as_4bit(b"nrys").unwrap(), as_4bit(b"NRYS").unwrap());
}
#[test]
fn test_as_4bit_sequence_too_long() {
let long_seq = vec![b'A'; 17];
assert!(matches!(
as_4bit(&long_seq),
Err(Error::SequenceTooLong(17))
));
}
#[test]
fn test_as_4bit_invalid_base() {
let result = as_4bit(b"ACGX");
assert!(matches!(result, Err(Error::InvalidBase(b'X'))));
}
#[test]
fn test_as_4bit_longer_sequences() {
let seq = b"ACGTACGTACGTACGT"; let result = as_4bit(seq);
assert!(result.is_ok());
let packed = result.unwrap();
for (i, &expected_base) in seq.iter().enumerate() {
let extracted_bits = (packed >> (i * 4)) & 0b1111;
let expected_bits = match expected_base {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' => 3,
_ => panic!("Unexpected base in test"),
};
assert_eq!(extracted_bits, expected_bits, "Mismatch at position {}", i);
}
}
#[test]
fn test_encode_internal() {
let sequence = b"ACGTACGTACGTACGTACGTACGTACGTACGT"; let mut ebuf = Vec::new();
encode_internal(sequence, &mut ebuf).unwrap();
assert_eq!(ebuf.len(), 2);
let first_chunk_expected = as_4bit(&sequence[..16]).unwrap();
assert_eq!(ebuf[0], first_chunk_expected);
let second_chunk_expected = as_4bit(&sequence[16..]).unwrap();
assert_eq!(ebuf[1], second_chunk_expected);
}
#[test]
fn test_encode_internal_partial_chunk() {
let sequence = b"ACGTACGTACGTACGTACGT"; let mut ebuf = Vec::new();
encode_internal(sequence, &mut ebuf).unwrap();
assert_eq!(ebuf.len(), 2);
let first_chunk_expected = as_4bit(&sequence[..16]).unwrap();
assert_eq!(ebuf[0], first_chunk_expected);
let second_chunk_expected = as_4bit(&sequence[16..]).unwrap();
assert_eq!(ebuf[1], second_chunk_expected);
}
}