1use crate::{KmerError, Result};
2
3pub type Kmer = u64;
4
5const MAX_K: usize = 32;
6
7pub const fn base_bits(b: u8) -> Option<u64> {
8 match b {
9 b'A' | b'a' => Some(0b00),
10 b'C' | b'c' => Some(0b01),
11 b'G' | b'g' => Some(0b10),
12 b'T' | b't' => Some(0b11),
13 _ => None,
14 }
15}
16
17const fn bits_base(bits: u64) -> u8 {
18 match bits & 0b11 {
19 0b00 => b'A',
20 0b01 => b'C',
21 0b10 => b'G',
22 _ => b'T',
23 }
24}
25
26pub fn encode(seq: &[u8]) -> Result<Kmer> {
27 let k = seq.len();
28 if !(1..=MAX_K).contains(&k) {
29 return Err(KmerError::KOutOfRange(k));
30 }
31 let mut bits: u64 = 0;
32 for (i, &b) in seq.iter().enumerate() {
33 let v = base_bits(b).ok_or(KmerError::NonAcgt { pos: i, byte: b })?;
34 bits = (bits << 2) | v;
35 }
36 Ok(bits)
37}
38
39#[must_use]
40pub fn decode(kmer: Kmer, k: usize) -> Vec<u8> {
41 let mut out = vec![0u8; k];
42 let mut bits = kmer;
43 for slot in out.iter_mut().rev() {
44 *slot = bits_base(bits);
45 bits >>= 2;
46 }
47 out
48}
49
50#[must_use]
51pub fn reverse_complement(kmer: Kmer, k: usize) -> Kmer {
52 let mut bits = kmer;
53 let comp = bits ^ ((1u64 << (2 * k)) - 1); let mut rc: u64 = 0;
55 bits = comp;
56 for _ in 0..k {
57 rc = (rc << 2) | (bits & 0b11);
58 bits >>= 2;
59 }
60 rc
61}
62
63#[must_use]
64pub fn canonical(kmer: Kmer, k: usize) -> Kmer {
65 let rc = reverse_complement(kmer, k);
66 kmer.min(rc)
67}
68
69#[cfg(test)]
70mod tests {
71 use super::*;
72
73 #[test]
74 fn encode_round_trip() {
75 let k = 8;
76 let seq = b"ACGTACGT";
77 let bits = encode(seq).unwrap();
78 let back = decode(bits, k);
79 assert_eq!(&back, seq);
80 }
81
82 #[test]
83 fn encode_rejects_n() {
84 let r = encode(b"ACGTN");
85 assert!(matches!(r, Err(KmerError::NonAcgt { pos: 4, .. })));
86 }
87
88 #[test]
89 fn encode_k_too_large_rejected() {
90 let seq = vec![b'A'; 33];
91 assert!(matches!(encode(&seq), Err(KmerError::KOutOfRange(33))));
92 }
93
94 #[test]
95 fn rc_of_rc_is_identity() {
96 let bits = encode(b"ACGTACGT").unwrap();
97 let rc = reverse_complement(bits, 8);
98 let back = reverse_complement(rc, 8);
99 assert_eq!(back, bits);
100 }
101
102 #[test]
103 fn rc_known_value() {
104 assert_eq!(
105 decode(reverse_complement(encode(b"AAAA").unwrap(), 4), 4),
106 b"TTTT".to_vec()
107 );
108 assert_eq!(
109 decode(reverse_complement(encode(b"ACGT").unwrap(), 4), 4),
110 b"ACGT".to_vec()
111 );
112 }
113
114 #[test]
115 fn canonical_picks_lex_min_of_pair() {
116 let fwd = encode(b"GGGG").unwrap();
117 let rc = reverse_complement(fwd, 4);
118 assert_eq!(canonical(fwd, 4), rc);
119 assert_eq!(canonical(fwd, 4), canonical(rc, 4));
120 }
121}