Skip to main content

rsomics_kmer/
encode.rs

1use crate::{KmerError, Result};
2
3pub type Kmer = u64;
4
5const MAX_K: usize = 32;
6
7pub const fn base_bits(b: u8) -> Option<u64> {
8    match b {
9        b'A' | b'a' => Some(0b00),
10        b'C' | b'c' => Some(0b01),
11        b'G' | b'g' => Some(0b10),
12        b'T' | b't' => Some(0b11),
13        _ => None,
14    }
15}
16
17const fn bits_base(bits: u64) -> u8 {
18    match bits & 0b11 {
19        0b00 => b'A',
20        0b01 => b'C',
21        0b10 => b'G',
22        _ => b'T',
23    }
24}
25
26pub fn encode(seq: &[u8]) -> Result<Kmer> {
27    let k = seq.len();
28    if !(1..=MAX_K).contains(&k) {
29        return Err(KmerError::KOutOfRange(k));
30    }
31    let mut bits: u64 = 0;
32    for (i, &b) in seq.iter().enumerate() {
33        let v = base_bits(b).ok_or(KmerError::NonAcgt { pos: i, byte: b })?;
34        bits = (bits << 2) | v;
35    }
36    Ok(bits)
37}
38
39#[must_use]
40pub fn decode(kmer: Kmer, k: usize) -> Vec<u8> {
41    let mut out = vec![0u8; k];
42    let mut bits = kmer;
43    for slot in out.iter_mut().rev() {
44        *slot = bits_base(bits);
45        bits >>= 2;
46    }
47    out
48}
49
50#[must_use]
51pub fn reverse_complement(kmer: Kmer, k: usize) -> Kmer {
52    let mut bits = kmer;
53    let comp = bits ^ ((1u64 << (2 * k)) - 1); // XOR with all-1s per 2-bit pair: A↔T, C↔G
54    let mut rc: u64 = 0;
55    bits = comp;
56    for _ in 0..k {
57        rc = (rc << 2) | (bits & 0b11);
58        bits >>= 2;
59    }
60    rc
61}
62
63#[must_use]
64pub fn canonical(kmer: Kmer, k: usize) -> Kmer {
65    let rc = reverse_complement(kmer, k);
66    kmer.min(rc)
67}
68
69#[cfg(test)]
70mod tests {
71    use super::*;
72
73    #[test]
74    fn encode_round_trip() {
75        let k = 8;
76        let seq = b"ACGTACGT";
77        let bits = encode(seq).unwrap();
78        let back = decode(bits, k);
79        assert_eq!(&back, seq);
80    }
81
82    #[test]
83    fn encode_rejects_n() {
84        let r = encode(b"ACGTN");
85        assert!(matches!(r, Err(KmerError::NonAcgt { pos: 4, .. })));
86    }
87
88    #[test]
89    fn encode_k_too_large_rejected() {
90        let seq = vec![b'A'; 33];
91        assert!(matches!(encode(&seq), Err(KmerError::KOutOfRange(33))));
92    }
93
94    #[test]
95    fn rc_of_rc_is_identity() {
96        let bits = encode(b"ACGTACGT").unwrap();
97        let rc = reverse_complement(bits, 8);
98        let back = reverse_complement(rc, 8);
99        assert_eq!(back, bits);
100    }
101
102    #[test]
103    fn rc_known_value() {
104        assert_eq!(
105            decode(reverse_complement(encode(b"AAAA").unwrap(), 4), 4),
106            b"TTTT".to_vec()
107        );
108        assert_eq!(
109            decode(reverse_complement(encode(b"ACGT").unwrap(), 4), 4),
110            b"ACGT".to_vec()
111        );
112    }
113
114    #[test]
115    fn canonical_picks_lex_min_of_pair() {
116        let fwd = encode(b"GGGG").unwrap();
117        let rc = reverse_complement(fwd, 4);
118        assert_eq!(canonical(fwd, 4), rc);
119        assert_eq!(canonical(fwd, 4), canonical(rc, 4));
120    }
121}