ragc_core/
kmer.rs

1// K-mer operations
2// Rust equivalent of src/core/kmer.h
3
4use ragc_common::Base;
5
6/// K-mer mode (orientation)
7#[derive(Debug, Clone, Copy, PartialEq, Eq)]
8pub enum KmerMode {
9    Direct,
10    RevComp,
11    Canonical,
12}
13
14/// K-mer structure that can store both direct and reverse complement representations
15/// K-mers are stored in a compact 64-bit representation, with 2 bits per base.
16#[derive(Debug, Clone)]
17pub struct Kmer {
18    kmer_dir: u64,     // Direct k-mer
19    kmer_rc: u64,      // Reverse complement k-mer
20    cur_size: u32,     // Current size of k-mer
21    max_size: u32,     // Maximum k-mer size
22    variant: KmerMode, // Which variant to use
23    mask: u64,         // Mask for maximum k-mer size
24    shift: u32,        // Shift amount
25}
26
27impl Kmer {
28    /// Create a new empty k-mer with specified maximum size and mode
29    pub fn new(max_size: u32, variant: KmerMode) -> Self {
30        let shift = 64 - 2 * max_size;
31        let mask = (!0u64) << shift;
32
33        Self {
34            kmer_dir: 0,
35            kmer_rc: 0,
36            cur_size: 0,
37            max_size,
38            variant,
39            mask,
40            shift,
41        }
42    }
43
44    /// Create a k-mer from existing direct and reverse complement values
45    pub fn from_values(
46        kmer_dir: u64,
47        kmer_rc: u64,
48        max_size: u32,
49        cur_size: u32,
50        variant: KmerMode,
51    ) -> Self {
52        let shift = 64 - 2 * max_size;
53        let mask = (!0u64) << shift;
54
55        Self {
56            kmer_dir,
57            kmer_rc,
58            cur_size,
59            max_size,
60            variant,
61            mask,
62            shift,
63        }
64    }
65
66    /// Reset the k-mer to empty state
67    pub fn reset(&mut self) {
68        self.kmer_dir = 0;
69        self.kmer_rc = 0;
70        self.cur_size = 0;
71    }
72
73    /// Insert a symbol (base) at the end of the k-mer (canonical mode)
74    #[inline]
75    pub fn insert_canonical(&mut self, symbol: u64) {
76        // Reverse complement code
77        self.kmer_rc >>= 2;
78        self.kmer_rc += reverse_complement(symbol) << 62;
79        self.kmer_rc &= self.mask;
80
81        // Direct code
82        if self.cur_size == self.max_size {
83            self.kmer_dir <<= 2;
84            self.kmer_dir += symbol << self.shift;
85        } else {
86            self.cur_size += 1;
87            self.kmer_dir += symbol << (64 - 2 * self.cur_size);
88        }
89    }
90
91    /// Insert a symbol based on the current mode
92    #[inline]
93    pub fn insert(&mut self, symbol: u64) {
94        match self.variant {
95            KmerMode::Direct => self.insert_direct(symbol),
96            KmerMode::RevComp => self.insert_rev_comp(symbol),
97            KmerMode::Canonical => self.insert_canonical(symbol),
98        }
99    }
100
101    /// Insert a symbol (direct mode)
102    #[inline]
103    fn insert_direct(&mut self, symbol: u64) {
104        if self.cur_size == self.max_size {
105            self.kmer_dir <<= 2;
106            self.kmer_dir += symbol << self.shift;
107        } else {
108            self.cur_size += 1;
109            self.kmer_dir += symbol << (64 - 2 * self.cur_size);
110        }
111    }
112
113    /// Insert a symbol (reverse complement mode)
114    #[inline]
115    fn insert_rev_comp(&mut self, symbol: u64) {
116        self.kmer_rc >>= 2;
117        self.kmer_rc += reverse_complement(symbol) << 62;
118        self.kmer_rc &= self.mask;
119
120        if self.cur_size < self.max_size {
121            self.cur_size += 1;
122        }
123    }
124
125    /// Get the k-mer data based on current mode
126    #[inline]
127    pub fn data(&self) -> u64 {
128        match self.variant {
129            KmerMode::Direct => self.kmer_dir,
130            KmerMode::RevComp => self.kmer_rc,
131            KmerMode::Canonical => self.kmer_dir.min(self.kmer_rc),
132        }
133    }
134
135    /// Get the canonical k-mer (minimum of direct and reverse complement)
136    #[inline]
137    pub fn data_canonical(&self) -> u64 {
138        self.kmer_dir.min(self.kmer_rc)
139    }
140
141    /// Get the direct k-mer
142    #[inline]
143    pub fn data_dir(&self) -> u64 {
144        match self.variant {
145            KmerMode::Direct | KmerMode::Canonical => self.kmer_dir,
146            KmerMode::RevComp => 0,
147        }
148    }
149
150    /// Get the reverse complement k-mer
151    #[inline]
152    pub fn data_rc(&self) -> u64 {
153        match self.variant {
154            KmerMode::RevComp | KmerMode::Canonical => self.kmer_rc,
155            KmerMode::Direct => 0,
156        }
157    }
158
159    /// Check if k-mer is full (reached maximum size)
160    #[inline]
161    pub fn is_full(&self) -> bool {
162        self.cur_size == self.max_size
163    }
164
165    /// Get current size
166    #[inline]
167    pub fn get_cur_size(&self) -> u32 {
168        self.cur_size
169    }
170
171    /// Get maximum size
172    #[inline]
173    pub fn get_max_size(&self) -> u32 {
174        self.max_size
175    }
176
177    /// Get a symbol at a specific position
178    #[inline]
179    pub fn get_symbol(&self, pos: u32) -> u64 {
180        match self.variant {
181            KmerMode::Direct | KmerMode::Canonical => {
182                let sym_shift = 62 - 2 * pos;
183                (self.kmer_dir >> sym_shift) & 3
184            }
185            KmerMode::RevComp => {
186                let sym_shift = 64 - 2 * self.cur_size + 2 * pos;
187                (self.kmer_rc >> sym_shift) & 3
188            }
189        }
190    }
191
192    /// Swap direct and reverse complement representations
193    pub fn swap_dir_rc(&mut self) {
194        if self.variant == KmerMode::Canonical {
195            std::mem::swap(&mut self.kmer_dir, &mut self.kmer_rc);
196        }
197    }
198
199    /// Check if the k-mer is in direct orientation (for canonical mode)
200    pub fn is_dir_oriented(&self) -> bool {
201        if self.variant == KmerMode::Canonical {
202            self.kmer_dir <= self.kmer_rc
203        } else {
204            false
205        }
206    }
207}
208
209impl PartialEq for Kmer {
210    fn eq(&self, other: &Self) -> bool {
211        match self.variant {
212            KmerMode::Direct | KmerMode::Canonical => self.kmer_dir == other.kmer_dir,
213            KmerMode::RevComp => self.kmer_rc == other.kmer_rc,
214        }
215    }
216}
217
218/// Compute reverse complement of a 2-bit encoded base
219/// A(0) <-> T(3), C(1) <-> G(2)
220#[inline]
221pub const fn reverse_complement(base: u64) -> u64 {
222    match base {
223        0 => 3, // A -> T
224        1 => 2, // C -> G
225        2 => 1, // G -> C
226        3 => 0, // T -> A
227        _ => 4, // Invalid
228    }
229}
230
231/// Encode a character to a 2-bit value (A=0, C=1, G=2, T=3)
232#[inline]
233pub fn encode_base(c: char) -> Option<u64> {
234    Base::from_char(c).map(|b| b as u64)
235}
236
237/// Decode a 2-bit value to a character (0=A, 1=C, 2=G, 3=T)
238#[inline]
239pub fn decode_base(val: u64) -> Option<char> {
240    Base::from_u8(val as u8).map(|b| b.to_char())
241}
242
243/// Extract a k-mer from a sequence at a given position
244pub fn extract_kmer(sequence: &[u8], pos: usize, k: usize, mode: KmerMode) -> Option<Kmer> {
245    if pos + k > sequence.len() {
246        return None;
247    }
248
249    let mut kmer = Kmer::new(k as u32, mode);
250
251    for i in 0..k {
252        let base = encode_base(sequence[pos + i] as char)?;
253        kmer.insert(base);
254    }
255
256    Some(kmer)
257}
258
259/// Compute the canonical k-mer (minimum of k-mer and its reverse complement)
260#[inline]
261pub fn canonical_kmer(kmer: u64, k: u32) -> u64 {
262    let rc = reverse_complement_kmer(kmer, k);
263    kmer.min(rc)
264}
265
266/// Compute reverse complement of an entire k-mer
267pub fn reverse_complement_kmer(kmer: u64, k: u32) -> u64 {
268    let mut result = 0u64;
269    let shift = 64 - 2 * k;
270
271    for i in 0..k {
272        let base = (kmer >> (shift + 2 * i)) & 3;
273        let rc_base = reverse_complement(base);
274        result |= rc_base << (shift + 2 * (k - 1 - i));
275    }
276
277    result
278}
279
280#[cfg(test)]
281mod tests {
282    use super::*;
283
284    #[test]
285    fn test_reverse_complement() {
286        assert_eq!(reverse_complement(0), 3); // A -> T
287        assert_eq!(reverse_complement(1), 2); // C -> G
288        assert_eq!(reverse_complement(2), 1); // G -> C
289        assert_eq!(reverse_complement(3), 0); // T -> A
290    }
291
292    #[test]
293    fn test_encode_decode() {
294        assert_eq!(encode_base('A'), Some(0));
295        assert_eq!(encode_base('C'), Some(1));
296        assert_eq!(encode_base('G'), Some(2));
297        assert_eq!(encode_base('T'), Some(3));
298
299        assert_eq!(decode_base(0), Some('A'));
300        assert_eq!(decode_base(1), Some('C'));
301        assert_eq!(decode_base(2), Some('G'));
302        assert_eq!(decode_base(3), Some('T'));
303    }
304
305    #[test]
306    fn test_kmer_insertion() {
307        let mut kmer = Kmer::new(4, KmerMode::Canonical);
308
309        // Insert ACGT
310        kmer.insert(0); // A
311        kmer.insert(1); // C
312        kmer.insert(2); // G
313        kmer.insert(3); // T
314
315        assert!(kmer.is_full());
316        assert_eq!(kmer.get_cur_size(), 4);
317    }
318
319    #[test]
320    fn test_kmer_data() {
321        let mut kmer = Kmer::new(4, KmerMode::Direct);
322
323        // Insert AAAA (all zeros)
324        for _ in 0..4 {
325            kmer.insert(0);
326        }
327
328        // AAAA should be 0x0000... in the top bits
329        assert_eq!(kmer.data_dir() >> 56, 0);
330    }
331
332    #[test]
333    fn test_canonical_kmer() {
334        // Test that ACGT and its reverse complement ACGT give the same canonical
335        let mut kmer1 = Kmer::new(4, KmerMode::Canonical);
336        kmer1.insert(0); // A
337        kmer1.insert(1); // C
338        kmer1.insert(2); // G
339        kmer1.insert(3); // T
340
341        let canonical = kmer1.data_canonical();
342        // The canonical should be the smaller of forward and RC
343        assert!(canonical == kmer1.kmer_dir || canonical == kmer1.kmer_rc);
344        assert_eq!(canonical, kmer1.kmer_dir.min(kmer1.kmer_rc));
345    }
346}