Skip to main content

fgumi_dna/
dna.rs

1//! DNA sequence utilities.
2//!
3//! This module provides common DNA sequence operations like reverse complement.
4
5use crate::{NO_CALL_BASE, NO_CALL_BASE_LOWER};
6
7/// Complements a single DNA base, normalizing to uppercase except for N/n.
8///
9/// Returns the Watson-Crick complement: A<->T, C<->G.
10/// Both uppercase and lowercase A/T/C/G input are normalized to uppercase output.
11///
12/// # Case preservation for N/n
13///
14/// Uppercase 'N' and lowercase 'n' are returned unchanged (case preserved).
15/// This is required for fgbio compatibility in CODEC consensus calling:
16///
17/// - **CODEC** uses lowercase 'n' for padding when R1/R2 have different lengths
18///   (see `codec_caller::pad_consensus`). These padded bases are then reverse
19///   complemented for ac/bc tags (see `codec_caller::reverse_complement_ss`).
20///   fgbio preserves lowercase 'n' through this operation.
21///
22/// - **Simplex/Duplex** use only uppercase 'N' for no-call bases in production
23///   (see `vanilla_caller` lines 640, 1024, 1078 and `duplex_caller` `NO_CALL` const).
24///   The `padded_default` function with lowercase 'n' exists but is only used in tests.
25///
26/// - **`Zipper/tag_reversal`** do not create 'n' bases, only reverse complement
27///   existing tag values which come from upstream processing.
28#[inline]
29#[must_use]
30pub const fn complement_base(base: u8) -> u8 {
31    match base {
32        b'A' | b'a' => b'T',
33        b'T' | b't' => b'A',
34        b'C' | b'c' => b'G',
35        b'G' | b'g' => b'C',
36        NO_CALL_BASE => NO_CALL_BASE,             // 'N' stays 'N'
37        NO_CALL_BASE_LOWER => NO_CALL_BASE_LOWER, // 'n' stays 'n' (fgbio compat)
38        _ => base,
39    }
40}
41
42/// Reverse complements a DNA sequence.
43///
44/// Returns the reverse complement of the input sequence, normalizing to uppercase.
45/// A<->T, C<->G, N and other bases are unchanged.
46///
47/// # Examples
48///
49/// ```
50/// use fgumi_dna::dna::reverse_complement;
51///
52/// assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
53/// assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
54/// assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
55/// assert_eq!(reverse_complement(b"acgt"), b"ACGT".to_vec());  // normalized to uppercase
56/// ```
57#[must_use]
58pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
59    seq.iter().rev().map(|&base| complement_base(base)).collect()
60}
61
62/// Complements a single DNA base, preserving case.
63///
64/// Returns the Watson-Crick complement: A<->T, C<->G.
65/// Unlike [`complement_base`], this preserves the case of the input:
66/// uppercase input produces uppercase output, lowercase produces lowercase.
67///
68/// # Examples
69///
70/// ```
71/// use fgumi_dna::dna::complement_base_preserve_case;
72///
73/// assert_eq!(complement_base_preserve_case(b'A'), b'T');
74/// assert_eq!(complement_base_preserve_case(b'a'), b't');
75/// ```
76#[inline]
77#[must_use]
78pub const fn complement_base_preserve_case(base: u8) -> u8 {
79    match base {
80        b'A' => b'T',
81        b'T' => b'A',
82        b'C' => b'G',
83        b'G' => b'C',
84        b'a' => b't',
85        b't' => b'a',
86        b'c' => b'g',
87        b'g' => b'c',
88        other => other,
89    }
90}
91
92/// Reverse complements a DNA string, preserving case.
93///
94/// Returns the reverse complement of the input string.
95/// Unlike [`reverse_complement`], this preserves the case of each base.
96/// Non-ASCII characters are passed through unchanged.
97///
98/// # Examples
99///
100/// ```
101/// use fgumi_dna::dna::reverse_complement_str;
102///
103/// assert_eq!(reverse_complement_str("ACGT"), "ACGT");
104/// assert_eq!(reverse_complement_str("AAAA"), "TTTT");
105/// assert_eq!(reverse_complement_str("acgt"), "acgt");
106/// ```
107#[must_use]
108pub fn reverse_complement_str(seq: &str) -> String {
109    seq.chars()
110        .rev()
111        .map(|c| if c.is_ascii() { complement_base_preserve_case(c as u8) as char } else { c })
112        .collect()
113}
114
115#[cfg(test)]
116mod tests {
117    use rstest::rstest;
118
119    use super::*;
120
121    #[test]
122    fn test_complement_base() {
123        // Uppercase bases
124        assert_eq!(complement_base(b'A'), b'T');
125        assert_eq!(complement_base(b'T'), b'A');
126        assert_eq!(complement_base(b'C'), b'G');
127        assert_eq!(complement_base(b'G'), b'C');
128
129        // Lowercase normalized to uppercase
130        assert_eq!(complement_base(b'a'), b'T');
131        assert_eq!(complement_base(b't'), b'A');
132        assert_eq!(complement_base(b'c'), b'G');
133        assert_eq!(complement_base(b'g'), b'C');
134
135        // N bases preserve case (fgbio uses lowercase 'n' for padding)
136        assert_eq!(complement_base(b'N'), b'N');
137        assert_eq!(complement_base(b'n'), b'n');
138
139        // IUPAC ambiguity codes unchanged
140        for code in [b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V'] {
141            assert_eq!(complement_base(code), code);
142        }
143
144        // Special characters unchanged
145        for c in [b'.', b'-', b'*', b'0', b'X'] {
146            assert_eq!(complement_base(c), c);
147        }
148    }
149
150    #[test]
151    fn test_reverse_complement() {
152        // Basic cases
153        assert_eq!(reverse_complement(b""), b"".to_vec());
154        assert_eq!(reverse_complement(b"A"), b"T".to_vec());
155        assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
156        assert_eq!(reverse_complement(b"TTTT"), b"AAAA".to_vec());
157        assert_eq!(reverse_complement(b"CCCC"), b"GGGG".to_vec());
158        assert_eq!(reverse_complement(b"GGGG"), b"CCCC".to_vec());
159        assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
160
161        // Lowercase normalized to uppercase
162        assert_eq!(reverse_complement(b"acgt"), b"ACGT".to_vec());
163        assert_eq!(reverse_complement(b"AcGt"), b"ACGT".to_vec());
164
165        // Palindromic sequences
166        assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
167        assert_eq!(reverse_complement(b"GAATTC"), b"GAATTC".to_vec());
168
169        // Double operation returns original (uppercase)
170        let seq = b"ACGTACGT";
171        assert_eq!(reverse_complement(&reverse_complement(seq)), seq.to_vec());
172    }
173
174    #[rstest]
175    #[case(b'A', b'T')]
176    #[case(b'T', b'A')]
177    #[case(b'C', b'G')]
178    #[case(b'G', b'C')]
179    #[case(b'a', b't')]
180    #[case(b't', b'a')]
181    #[case(b'c', b'g')]
182    #[case(b'g', b'c')]
183    #[case(b'N', b'N')]
184    #[case(b'n', b'n')]
185    #[case(b'.', b'.')]
186    #[case(b'-', b'-')]
187    #[case(b'*', b'*')]
188    #[case(b'0', b'0')]
189    #[case(b'X', b'X')]
190    fn test_complement_base_preserve_case(#[case] input: u8, #[case] expected: u8) {
191        assert_eq!(complement_base_preserve_case(input), expected);
192    }
193
194    #[test]
195    fn test_reverse_complement_str() {
196        // Basic cases
197        assert_eq!(reverse_complement_str(""), "");
198        assert_eq!(reverse_complement_str("ACGT"), "ACGT");
199        assert_eq!(reverse_complement_str("AAAA"), "TTTT");
200
201        // Case preserved
202        assert_eq!(reverse_complement_str("acgt"), "acgt");
203        assert_eq!(reverse_complement_str("AcGt"), "aCgT");
204
205        // Special characters unchanged
206        assert_eq!(reverse_complement_str("A-C-G-T"), "A-C-G-T");
207
208        // Double operation preserves original
209        for seq in ["ACGT", "acgt", "AcGt"] {
210            assert_eq!(reverse_complement_str(&reverse_complement_str(seq)), seq);
211        }
212    }
213
214    #[test]
215    fn test_reverse_complement_str_non_ascii() {
216        // Non-ASCII characters (e.g. multi-byte UTF-8) should pass through unchanged
217        let input = "A\u{00e9}T"; // 'é' is a 2-byte UTF-8 character
218        let result = reverse_complement_str(input);
219        assert_eq!(result, "A\u{00e9}T");
220    }
221}