use crate::error::KmerError;
const A: u8 = 0; const C: u8 = 1; const G: u8 = 2; const T: u8 = 3;
pub const MAX_KMER_SIZE_IN_U64: usize = 32;
pub const MAX_KMER_SIZE_IN_U128: usize = 64;
pub fn encode_kmer(sequence: &str) -> Result<u64, KmerError> {
encode_kmer_bytes(sequence.as_bytes())
}
pub fn encode_kmer_bytes(sequence: &[u8]) -> Result<u64, KmerError> {
if sequence.len() > MAX_KMER_SIZE_IN_U64 {
return Err(KmerError::InvalidKmerSize(
(sequence.len() as u32).try_into().unwrap_or(u32::MAX),
));
}
let mut encoded = 0u64;
let mut pos = sequence.len();
for (i, &byte) in sequence.iter().enumerate() {
let base = match byte.to_ascii_uppercase() {
b'A' => A,
b'C' => C,
b'G' => G,
b'T' => T,
_ => {
return Err(KmerError::InvalidCharacter {
pos: i,
char: char::from(byte),
})
}
};
encoded <<= 2;
encoded |= base as u64;
pos -= 1;
}
encoded <<= pos * 2;
Ok(encoded)
}
pub fn decode_kmer(encoded: u64, length: usize) -> String {
if length > MAX_KMER_SIZE_IN_U64 {
return String::new();
}
let mut result = String::with_capacity(length);
let bits_to_shift = (64 - (length * 2)) as u32;
let mut encoded = encoded << bits_to_shift;
for _ in 0..length {
let base = (encoded >> 62) & 0b11;
let char = match base {
0 => 'A',
1 => 'C',
2 => 'G',
3 => 'T',
_ => 'N', };
result.push(char);
encoded <<= 2;
}
result
}
pub fn reverse_complement(encoded: u64, length: usize) -> u64 {
if length > MAX_KMER_SIZE_IN_U64 {
return 0;
}
let mut rc = 0u64;
let mut temp_encoded = encoded;
for _ in 0..length {
let base = temp_encoded & 0b11;
let complement = 3 - base; rc <<= 2;
rc |= complement;
temp_encoded >>= 2;
}
rc
}
pub fn validate_sequence(sequence: &str) -> bool {
sequence
.chars()
.all(|ch| matches!(ch.to_ascii_uppercase(), 'A' | 'C' | 'G' | 'T'))
}
pub fn has_ambiguous_bases(sequence: &str) -> bool {
!sequence
.chars()
.all(|ch| matches!(ch.to_ascii_uppercase(), 'A' | 'C' | 'G' | 'T'))
}
pub fn encode_kmer_u128(sequence: &str) -> Result<u128, KmerError> {
encode_kmer_bytes_u128(sequence.as_bytes())
}
pub fn encode_kmer_bytes_u128(sequence: &[u8]) -> Result<u128, KmerError> {
if sequence.is_empty() || sequence.len() > MAX_KMER_SIZE_IN_U128 {
return Err(KmerError::InvalidKmerSize(
(sequence.len() as u32).try_into().unwrap_or(u32::MAX),
));
}
let mut encoded: u128 = 0;
let length = sequence.len();
let bits_to_shift = (128 - (length * 2)) as u32;
encoded <<= bits_to_shift;
for &byte in sequence {
let base = byte.to_ascii_uppercase();
let value = match base {
b'A' => A,
b'C' => C,
b'G' => G,
b'T' => T,
_ => {
return Err(KmerError::InvalidCharacter {
pos: encoded.trailing_zeros() as usize / 2,
char: byte as char,
});
}
};
encoded = (encoded << 2) | (value as u128);
}
Ok(encoded)
}
pub fn decode_kmer_u128(encoded: u128, length: usize) -> String {
if length == 0 || length > MAX_KMER_SIZE_IN_U128 {
return String::new();
}
let mut result = String::with_capacity(length);
let bits_to_shift = (128 - (length * 2)) as u32;
let mut encoded = encoded << bits_to_shift;
for _ in 0..length {
let base = (encoded >> 126) & 0b11;
let char = match base {
0 => 'A',
1 => 'C',
2 => 'G',
3 => 'T',
_ => 'N', };
result.push(char);
encoded <<= 2;
}
result
}
pub fn encode_kmer_auto(sequence: &str) -> (u128, u8, bool) {
if sequence.len() <= MAX_KMER_SIZE_IN_U64 {
let encoded_u64 = encode_kmer_u64(sequence).unwrap_or(0);
(encoded_u64 as u128, sequence.len() as u8, false)
} else {
let encoded_u128 = encode_kmer_u128(sequence).unwrap_or(0);
(encoded_u128, sequence.len() as u8, true)
}
}
pub fn encode_kmer_u64(sequence: &str) -> Result<u64, KmerError> {
encode_kmer(sequence)
}
pub fn decode_kmer_u64(encoded: u64, length: usize) -> String {
decode_kmer(encoded, length)
}
pub fn reverse_complement_u128(encoded: u128, length: usize) -> u128 {
if length > MAX_KMER_SIZE_IN_U128 {
return 0;
}
let mut rc = 0u128;
let mut temp_encoded = encoded;
for _ in 0..length {
let base = temp_encoded & 0b11;
let complement = 3 - base; rc <<= 2;
rc |= complement;
temp_encoded >>= 2;
}
rc
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_encode_decode_simple() {
let sequence = "ATGC";
let encoded = encode_kmer(sequence).unwrap();
let decoded = decode_kmer(encoded, sequence.len());
assert_eq!(decoded, sequence);
}
#[test]
fn test_encode_decode_longer() {
let sequence = "ATGCGATGCGATGCGATGCGATGCGATGCGATGC";
let encoded = encode_kmer_u128(sequence).unwrap();
let decoded = decode_kmer_u128(encoded, sequence.len());
assert_eq!(decoded, sequence);
}
#[test]
fn test_reverse_complement() {
let sequence = "ATGC";
let encoded = encode_kmer(sequence).unwrap();
let rc = reverse_complement(encoded, sequence.len());
let rc_decoded = decode_kmer(rc, sequence.len());
assert_eq!(rc_decoded, "GCAT");
}
#[test]
fn test_invalid_character() {
let result = encode_kmer("ATXG");
assert!(result.is_err());
if let Err(KmerError::InvalidCharacter { pos, char: c }) = result {
assert_eq!(pos, 2);
assert_eq!(c, 'X');
} else {
panic!("Expected InvalidCharacter error");
}
}
#[test]
fn test_too_long_kmer() {
let long_seq = "A".repeat(33);
let result = encode_kmer(&long_seq);
assert!(result.is_err());
}
#[test]
fn test_validate_sequence() {
assert!(validate_sequence("ATGC"));
assert!(validate_sequence("atgc"));
assert!(!validate_sequence("ATXG"));
assert!(validate_sequence("")); }
#[test]
fn test_has_ambiguous_bases() {
assert!(!has_ambiguous_bases("ATGC"));
assert!(has_ambiguous_bases("ATNG"));
assert!(!has_ambiguous_bases("")); }
#[test]
fn test_canonical_property() {
let sequence = "ATGCGAT";
let encoded = encode_kmer(sequence).unwrap();
let rc = reverse_complement(encoded, sequence.len());
let rc_rc = reverse_complement(rc, sequence.len());
assert_eq!(encoded, rc_rc);
}
#[test]
fn test_bit_patterns() {
let encoded_a = encode_kmer("A").unwrap();
let encoded_t = encode_kmer("T").unwrap();
let encoded_g = encode_kmer("G").unwrap();
let encoded_c = encode_kmer("C").unwrap();
assert_eq!((encoded_a & 0b11) as u8, A);
assert_eq!((encoded_t & 0b11) as u8, T);
assert_eq!((encoded_g & 0b11) as u8, G);
assert_eq!((encoded_c & 0b11) as u8, C);
}
}