ragc_core/
preprocessing.rs

// Contig preprocessing functions matching C++ AGC genome_io logic
2
/// Lookup table equivalent to C++ AGC's `cnv_num[128]`.
///
/// The table works in two directions:
/// - indices `0..16` hold the ASCII symbol of each numeric DNA code
///   (`A C G T N R Y S W K M B D H V U`);
/// - indices `65..128` hold the numeric code of each ASCII character, with
///   upper- and lowercase letters sharing a code and every character that is
///   not one of the 16 symbols holding 30.
///
/// Indices `16..=64` and index 96 hold a space.
const CNV_NUM: [u8; 128] = {
    // Symbol for each numeric code; the position in this array is the code.
    const SYMBOLS: [u8; 16] = *b"ACGTNRYSWKMBDHVU";
    // Value stored for characters >= 65 that are not one of `SYMBOLS`.
    const UNKNOWN: u8 = 30;

    // Start from an all-space table; indices 16..=64 and 96 keep this value.
    let mut table = [b' '; 128];

    // Default every character from 'A' (65) upwards, except '`' (96), to UNKNOWN.
    let mut idx = 65;
    while idx < table.len() {
        if idx != 96 {
            table[idx] = UNKNOWN;
        }
        idx += 1;
    }

    // Fill in both directions for the 16 known symbols.
    let mut code = 0;
    while code < SYMBOLS.len() {
        let symbol = SYMBOLS[code];
        let upper = symbol as usize;
        let lower = upper | 0x20; // ASCII lowercase of the same letter

        table[code] = symbol;
        // `code` is below 16, so the narrowing cast cannot truncate.
        table[upper] = code as u8;
        table[lower] = code as u8;
        code += 1;
    }

    table
};
21
22/// Preprocess raw contig by converting ASCII characters to numeric codes
23///
24/// Matches C++ AGC's preprocess_raw_contig() logic:
25/// - Filters bytes with high bits set (>= 64, i.e., letter characters)
26/// - Converts them using cnv_num lookup table
27/// - Removes all other characters (like spaces, newlines)
28/// - Uses loop unrolling for performance (processes 4 bytes at a time)
29///
30/// # Arguments
31/// * `contig` - Mutable vector to process in place
32///
33/// # Behavior
34/// - Modifies contig in place
35/// - Resizes to contain only converted characters
36pub fn preprocess_raw_contig(contig: &mut Vec<u8>) {
37    let len = contig.len();
38    let mut in_pos = 0usize;
39    let mut out_pos = 0usize;
40
41    // Handle remainder (len % 4) using Duff's device pattern
42    match len % 4 {
43        3 => {
44            let c = contig[in_pos];
45            in_pos += 1;
46            if c >> 6 != 0 {
47                // c >= 64
48                contig[out_pos] = CNV_NUM[c as usize];
49                out_pos += 1;
50            }
51            // Fall through to case 2
52            let c = contig[in_pos];
53            in_pos += 1;
54            if c >> 6 != 0 {
55                contig[out_pos] = CNV_NUM[c as usize];
56                out_pos += 1;
57            }
58            // Fall through to case 1
59            let c = contig[in_pos];
60            in_pos += 1;
61            if c >> 6 != 0 {
62                contig[out_pos] = CNV_NUM[c as usize];
63                out_pos += 1;
64            }
65        }
66        2 => {
67            let c = contig[in_pos];
68            in_pos += 1;
69            if c >> 6 != 0 {
70                contig[out_pos] = CNV_NUM[c as usize];
71                out_pos += 1;
72            }
73            // Fall through to case 1
74            let c = contig[in_pos];
75            in_pos += 1;
76            if c >> 6 != 0 {
77                contig[out_pos] = CNV_NUM[c as usize];
78                out_pos += 1;
79            }
80        }
81        1 => {
82            let c = contig[in_pos];
83            in_pos += 1;
84            if c >> 6 != 0 {
85                contig[out_pos] = CNV_NUM[c as usize];
86                out_pos += 1;
87            }
88        }
89        _ => {} // len % 4 == 0, nothing to do
90    }
91
92    // Process remaining bytes 4 at a time (loop unrolling)
93    while in_pos < len {
94        let c = contig[in_pos];
95        in_pos += 1;
96        if c >> 6 != 0 {
97            contig[out_pos] = CNV_NUM[c as usize];
98            out_pos += 1;
99        }
100
101        let c = contig[in_pos];
102        in_pos += 1;
103        if c >> 6 != 0 {
104            contig[out_pos] = CNV_NUM[c as usize];
105            out_pos += 1;
106        }
107
108        let c = contig[in_pos];
109        in_pos += 1;
110        if c >> 6 != 0 {
111            contig[out_pos] = CNV_NUM[c as usize];
112            out_pos += 1;
113        }
114
115        let c = contig[in_pos];
116        in_pos += 1;
117        if c >> 6 != 0 {
118            contig[out_pos] = CNV_NUM[c as usize];
119            out_pos += 1;
120        }
121    }
122
123    contig.truncate(out_pos);
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `preprocess_raw_contig` on a copy of `raw` and returns the result.
    fn converted(raw: &[u8]) -> Vec<u8> {
        let mut buffer = raw.to_vec();
        preprocess_raw_contig(&mut buffer);
        buffer
    }

    #[test]
    fn test_preprocess_raw_contig_acgt() {
        let expected: Vec<u8> = vec![0, 1, 2, 3];
        assert_eq!(converted(b"ACGT"), expected);
    }

    #[test]
    fn test_preprocess_raw_contig_with_spaces() {
        // A space is ASCII 32, which is below 64, so it is dropped.
        let expected: Vec<u8> = vec![0, 1, 2, 3];
        assert_eq!(converted(b"A C G T"), expected);
    }

    #[test]
    fn test_preprocess_raw_contig_mixed() {
        // A newline is ASCII 10, which is below 64, so it is dropped.
        let expected: Vec<u8> = vec![0, 1, 2, 3, 4, 0, 3, 2, 1];
        assert_eq!(converted(b"ACGTN\nATGC"), expected);
    }
}