fgumi_dna/dna.rs
1//! DNA sequence utilities.
2//!
3//! This module provides common DNA sequence operations like reverse complement.
4
5use crate::{NO_CALL_BASE, NO_CALL_BASE_LOWER};
6
7/// Complements a single DNA base, normalizing to uppercase except for N/n.
8///
9/// Returns the Watson-Crick complement: A<->T, C<->G.
10/// Both uppercase and lowercase A/T/C/G input are normalized to uppercase output.
11///
12/// # Case preservation for N/n
13///
14/// Uppercase 'N' and lowercase 'n' are returned unchanged (case preserved).
15/// This is required for fgbio compatibility in CODEC consensus calling:
16///
17/// - **CODEC** uses lowercase 'n' for padding when R1/R2 have different lengths
18/// (see `codec_caller::pad_consensus`). These padded bases are then reverse
19/// complemented for ac/bc tags (see `codec_caller::reverse_complement_ss`).
20/// fgbio preserves lowercase 'n' through this operation.
21///
22/// - **Simplex/Duplex** use only uppercase 'N' for no-call bases in production
23/// (see `vanilla_caller` lines 640, 1024, 1078 and `duplex_caller` `NO_CALL` const).
24/// The `padded_default` function with lowercase 'n' exists but is only used in tests.
25///
26/// - **`Zipper/tag_reversal`** do not create 'n' bases, only reverse complement
27/// existing tag values which come from upstream processing.
28#[inline]
29#[must_use]
30pub const fn complement_base(base: u8) -> u8 {
31 match base {
32 b'A' | b'a' => b'T',
33 b'T' | b't' => b'A',
34 b'C' | b'c' => b'G',
35 b'G' | b'g' => b'C',
36 NO_CALL_BASE => NO_CALL_BASE, // 'N' stays 'N'
37 NO_CALL_BASE_LOWER => NO_CALL_BASE_LOWER, // 'n' stays 'n' (fgbio compat)
38 _ => base,
39 }
40}
41
42/// Reverse complements a DNA sequence.
43///
44/// Returns the reverse complement of the input sequence, normalizing to uppercase.
45/// A<->T, C<->G, N and other bases are unchanged.
46///
47/// # Examples
48///
49/// ```
50/// use fgumi_dna::dna::reverse_complement;
51///
52/// assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
53/// assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
54/// assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
55/// assert_eq!(reverse_complement(b"acgt"), b"ACGT".to_vec()); // normalized to uppercase
56/// ```
57#[must_use]
58pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
59 seq.iter().rev().map(|&base| complement_base(base)).collect()
60}
61
62/// Complements a single DNA base, preserving case.
63///
64/// Returns the Watson-Crick complement: A<->T, C<->G.
65/// Unlike [`complement_base`], this preserves the case of the input:
66/// uppercase input produces uppercase output, lowercase produces lowercase.
67///
68/// # Examples
69///
70/// ```
71/// use fgumi_dna::dna::complement_base_preserve_case;
72///
73/// assert_eq!(complement_base_preserve_case(b'A'), b'T');
74/// assert_eq!(complement_base_preserve_case(b'a'), b't');
75/// ```
76#[inline]
77#[must_use]
78pub const fn complement_base_preserve_case(base: u8) -> u8 {
79 match base {
80 b'A' => b'T',
81 b'T' => b'A',
82 b'C' => b'G',
83 b'G' => b'C',
84 b'a' => b't',
85 b't' => b'a',
86 b'c' => b'g',
87 b'g' => b'c',
88 other => other,
89 }
90}
91
92/// Reverse complements a DNA string, preserving case.
93///
94/// Returns the reverse complement of the input string.
95/// Unlike [`reverse_complement`], this preserves the case of each base.
96/// Non-ASCII characters are passed through unchanged.
97///
98/// # Examples
99///
100/// ```
101/// use fgumi_dna::dna::reverse_complement_str;
102///
103/// assert_eq!(reverse_complement_str("ACGT"), "ACGT");
104/// assert_eq!(reverse_complement_str("AAAA"), "TTTT");
105/// assert_eq!(reverse_complement_str("acgt"), "acgt");
106/// ```
107#[must_use]
108pub fn reverse_complement_str(seq: &str) -> String {
109 seq.chars()
110 .rev()
111 .map(|c| if c.is_ascii() { complement_base_preserve_case(c as u8) as char } else { c })
112 .collect()
113}
114
115#[cfg(test)]
116mod tests {
117 use rstest::rstest;
118
119 use super::*;
120
121 #[test]
122 fn test_complement_base() {
123 // Uppercase bases
124 assert_eq!(complement_base(b'A'), b'T');
125 assert_eq!(complement_base(b'T'), b'A');
126 assert_eq!(complement_base(b'C'), b'G');
127 assert_eq!(complement_base(b'G'), b'C');
128
129 // Lowercase normalized to uppercase
130 assert_eq!(complement_base(b'a'), b'T');
131 assert_eq!(complement_base(b't'), b'A');
132 assert_eq!(complement_base(b'c'), b'G');
133 assert_eq!(complement_base(b'g'), b'C');
134
135 // N bases preserve case (fgbio uses lowercase 'n' for padding)
136 assert_eq!(complement_base(b'N'), b'N');
137 assert_eq!(complement_base(b'n'), b'n');
138
139 // IUPAC ambiguity codes unchanged
140 for code in [b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V'] {
141 assert_eq!(complement_base(code), code);
142 }
143
144 // Special characters unchanged
145 for c in [b'.', b'-', b'*', b'0', b'X'] {
146 assert_eq!(complement_base(c), c);
147 }
148 }
149
150 #[test]
151 fn test_reverse_complement() {
152 // Basic cases
153 assert_eq!(reverse_complement(b""), b"".to_vec());
154 assert_eq!(reverse_complement(b"A"), b"T".to_vec());
155 assert_eq!(reverse_complement(b"AAAA"), b"TTTT".to_vec());
156 assert_eq!(reverse_complement(b"TTTT"), b"AAAA".to_vec());
157 assert_eq!(reverse_complement(b"CCCC"), b"GGGG".to_vec());
158 assert_eq!(reverse_complement(b"GGGG"), b"CCCC".to_vec());
159 assert_eq!(reverse_complement(b"ACGTN"), b"NACGT".to_vec());
160
161 // Lowercase normalized to uppercase
162 assert_eq!(reverse_complement(b"acgt"), b"ACGT".to_vec());
163 assert_eq!(reverse_complement(b"AcGt"), b"ACGT".to_vec());
164
165 // Palindromic sequences
166 assert_eq!(reverse_complement(b"ACGT"), b"ACGT".to_vec());
167 assert_eq!(reverse_complement(b"GAATTC"), b"GAATTC".to_vec());
168
169 // Double operation returns original (uppercase)
170 let seq = b"ACGTACGT";
171 assert_eq!(reverse_complement(&reverse_complement(seq)), seq.to_vec());
172 }
173
174 #[rstest]
175 #[case(b'A', b'T')]
176 #[case(b'T', b'A')]
177 #[case(b'C', b'G')]
178 #[case(b'G', b'C')]
179 #[case(b'a', b't')]
180 #[case(b't', b'a')]
181 #[case(b'c', b'g')]
182 #[case(b'g', b'c')]
183 #[case(b'N', b'N')]
184 #[case(b'n', b'n')]
185 #[case(b'.', b'.')]
186 #[case(b'-', b'-')]
187 #[case(b'*', b'*')]
188 #[case(b'0', b'0')]
189 #[case(b'X', b'X')]
190 fn test_complement_base_preserve_case(#[case] input: u8, #[case] expected: u8) {
191 assert_eq!(complement_base_preserve_case(input), expected);
192 }
193
194 #[test]
195 fn test_reverse_complement_str() {
196 // Basic cases
197 assert_eq!(reverse_complement_str(""), "");
198 assert_eq!(reverse_complement_str("ACGT"), "ACGT");
199 assert_eq!(reverse_complement_str("AAAA"), "TTTT");
200
201 // Case preserved
202 assert_eq!(reverse_complement_str("acgt"), "acgt");
203 assert_eq!(reverse_complement_str("AcGt"), "aCgT");
204
205 // Special characters unchanged
206 assert_eq!(reverse_complement_str("A-C-G-T"), "A-C-G-T");
207
208 // Double operation preserves original
209 for seq in ["ACGT", "acgt", "AcGt"] {
210 assert_eq!(reverse_complement_str(&reverse_complement_str(seq)), seq);
211 }
212 }
213
214 #[test]
215 fn test_reverse_complement_str_non_ascii() {
216 // Non-ASCII characters (e.g. multi-byte UTF-8) should pass through unchanged
217 let input = "A\u{00e9}T"; // 'é' is a 2-byte UTF-8 character
218 let result = reverse_complement_str(input);
219 assert_eq!(result, "A\u{00e9}T");
220 }
221}