Skip to main content

rustalign_common/
dna.rs

1//! DNA sequence types and operations
2
3use std::fmt;
4
5/// A nucleotide (A, C, G, T, or N)
6#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
7#[repr(u8)]
8pub enum Nuc {
9    /// Adenine
10    A = 0,
11    /// Cytosine
12    C = 1,
13    /// Guanine
14    G = 2,
15    /// Thymine
16    T = 3,
17    /// Any nucleotide (wildcard)
18    #[default]
19    N = 4,
20}
21
22impl Nuc {
23    /// Maximum valid value for packed nucleotides
24    pub const MAX_PACKED: u8 = 0x3F; // 0b111111
25
26    /// Convert from ASCII byte
27    ///
28    /// IUPAC ambiguous codes (R, Y, M, K, S, W, H, B, V, D) are converted to N.
29    /// Lowercase versions are also accepted.
30    pub fn from_ascii(b: u8) -> Option<Nuc> {
31        match b {
32            b'A' | b'a' => Some(Nuc::A),
33            b'C' | b'c' => Some(Nuc::C),
34            b'G' | b'g' => Some(Nuc::G),
35            b'T' | b't' => Some(Nuc::T),
36            b'N' | b'n' => Some(Nuc::N),
37            // IUPAC ambiguous codes - convert to N
38            b'R' | b'r' | // A or G (purine)
39            b'Y' | b'y' | // C or T (pyrimidine)
40            b'M' | b'm' | // A or C
41            b'K' | b'k' | // G or T
42            b'S' | b's' | // G or C
43            b'W' | b'w' | // A or T
44            b'H' | b'h' | // A, C, or T
45            b'B' | b'b' | // C, G, or T
46            b'V' | b'v' | // A, C, or G
47            b'D' | b'd' | // A, G, or T
48            b'X' | b'x' | // any (same as N)
49            b'U' | b'u'   // Uracil (same as T in DNA)
50            => Some(Nuc::N),
51            _ => None,
52        }
53    }
54
55    /// Convert to ASCII byte
56    pub fn to_ascii(self) -> u8 {
57        match self {
58            Nuc::A => b'A',
59            Nuc::C => b'C',
60            Nuc::G => b'G',
61            Nuc::T => b'T',
62            Nuc::N => b'N',
63        }
64    }
65
66    /// Get complement nucleotide
67    pub fn complement(self) -> Nuc {
68        match self {
69            Nuc::A => Nuc::T,
70            Nuc::C => Nuc::G,
71            Nuc::G => Nuc::C,
72            Nuc::T => Nuc::A,
73            Nuc::N => Nuc::N,
74        }
75    }
76
77    /// Check if nucleotide is valid (not N)
78    pub fn is_valid(self) -> bool {
79        matches!(self, Nuc::A | Nuc::C | Nuc::G | Nuc::T)
80    }
81}
82
83impl fmt::Display for Nuc {
84    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
85        write!(f, "{}", self.to_ascii() as char)
86    }
87}
88
89/// A pair of nucleotides (bit-packing for FM-index)
90///
91/// Represents two nucleotides packed into a single byte:
92/// - Bits 0-1: first nucleotide
93/// - Bits 2-3: second nucleotide
94/// - Bits 4-7: unused
95#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
96#[repr(transparent)]
97pub struct NucPair(u8);
98
99impl NucPair {
100    /// Create a new nucleotide pair
101    pub fn new(lo: Nuc, hi: Nuc) -> Self {
102        NucPair((lo as u8) | ((hi as u8) << 2))
103    }
104
105    /// Get the low (first) nucleotide
106    pub fn lo(self) -> Nuc {
107        match self.0 & 0x03 {
108            0 => Nuc::A,
109            1 => Nuc::C,
110            2 => Nuc::G,
111            3 => Nuc::T,
112            _ => unreachable!(),
113        }
114    }
115
116    /// Get the high (second) nucleotide
117    pub fn hi(self) -> Nuc {
118        match (self.0 >> 2) & 0x03 {
119            0 => Nuc::A,
120            1 => Nuc::C,
121            2 => Nuc::G,
122            3 => Nuc::T,
123            _ => unreachable!(),
124        }
125    }
126
127    /// Get raw byte value
128    pub fn as_u8(self) -> u8 {
129        self.0
130    }
131
132    /// Create from raw byte
133    pub fn from_u8(b: u8) -> Self {
134        NucPair(b & 0x0F)
135    }
136}
137
138/// Reverse complement a DNA sequence slice
139pub fn revcomp(seq: &[Nuc]) -> Vec<Nuc> {
140    seq.iter().rev().map(|&n| n.complement()).collect()
141}
142
143/// Reverse complement a byte slice in-place
144#[allow(dead_code)]
145pub fn revcomp_in_place(seq: &mut [Nuc]) {
146    seq.reverse();
147    for n in seq.iter_mut() {
148        *n = n.complement();
149    }
150}
151
152/// Trait for types that can be reverse complemented
153pub trait RevComp {
154    /// Output type after reverse complement
155    type Output;
156
157    /// Reverse complement this value
158    fn revcomp(&self) -> Self::Output;
159}
160
161impl RevComp for [Nuc] {
162    type Output = Vec<Nuc>;
163
164    fn revcomp(&self) -> Self::Output {
165        revcomp(self)
166    }
167}
168
169#[cfg(test)]
170mod tests {
171    use super::*;
172
173    #[test]
174    fn test_nuc_roundtrip() {
175        assert_eq!(Nuc::from_ascii(b'A'), Some(Nuc::A));
176        assert_eq!(Nuc::from_ascii(b'c'), Some(Nuc::C));
177        assert_eq!(Nuc::from_ascii(b'G'), Some(Nuc::G));
178        assert_eq!(Nuc::from_ascii(b't'), Some(Nuc::T));
179        assert_eq!(Nuc::from_ascii(b'N'), Some(Nuc::N));
180        // Ambiguous codes convert to N
181        assert_eq!(Nuc::from_ascii(b'X'), Some(Nuc::N));
182        assert_eq!(Nuc::from_ascii(b'R'), Some(Nuc::N));
183        assert_eq!(Nuc::from_ascii(b'Y'), Some(Nuc::N));
184        // Invalid characters return None
185        assert_eq!(Nuc::from_ascii(b'!'), None);
186        assert_eq!(Nuc::from_ascii(b' '), None);
187    }
188
189    #[test]
190    fn test_nuc_complement() {
191        assert_eq!(Nuc::A.complement(), Nuc::T);
192        assert_eq!(Nuc::C.complement(), Nuc::G);
193        assert_eq!(Nuc::G.complement(), Nuc::C);
194        assert_eq!(Nuc::T.complement(), Nuc::A);
195        assert_eq!(Nuc::N.complement(), Nuc::N);
196    }
197
198    #[test]
199    fn test_nucpair() {
200        let pair = NucPair::new(Nuc::A, Nuc::C);
201        assert_eq!(pair.lo(), Nuc::A);
202        assert_eq!(pair.hi(), Nuc::C);
203        assert_eq!(pair.as_u8(), 0x04); // A=0, C=1 << 2
204    }
205
206    #[test]
207    fn test_revcomp() {
208        let seq = [Nuc::A, Nuc::C, Nuc::G, Nuc::T];
209        let rc = seq.revcomp();
210        assert_eq!(rc, vec![Nuc::A, Nuc::C, Nuc::G, Nuc::T]); // ACGT revcomp = ACGT
211    }
212}