ragc_core/
genome_io.rs

1// FASTA/FASTQ genome I/O
2// Rust equivalent of genome_io.cpp
3
4#![allow(dead_code)]
5
6use std::fs::File;
7use std::io::{self, BufRead, BufReader, Read, Write};
8use std::path::Path;
9
10use flate2::read::MultiGzDecoder;
11use ragc_common::Contig;
12
13/// Nucleotide conversion table matching C++ cnv_num
14/// Maps ASCII characters to numeric representation
15pub(crate) const CNV_NUM: [u8; 128] = [
16    // 0-15: Control characters -> ACGT... (matching C++)
17    b'A', b'C', b'G', b'T', b'N', b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V', b'U',
18    // 16-63: Unused
19    b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
20    b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
21    b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
22    // 64-79: Upper case letters starting at 'A' (65)
23    b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
24    // 80-95: Upper case continued
25    30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
26    // 96-111: Lower case letters starting at 'a' (97)
27    b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
28    // 112-127: Lower case continued
29    30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
30];
31
32/// Parse sample name and contig name from FASTA header
33///
34/// Format: >sample#haplotype#chromosome_part or >sample#haplotype#chromosome
35/// Examples:
36///   "S288C#1#chrI" -> ("S288C#1", "chrI")
37///   "AAA#0#chrI_1" -> ("AAA#0", "chrI_1")
38///   "simple_name" -> ("unknown", "simple_name")
39///
40pub fn parse_sample_from_header(header: &str) -> (String, String) {
41    // Split by '#'
42    let parts: Vec<&str> = header.split('#').collect();
43
44    if parts.len() >= 3 {
45        // Format: sample#haplotype#chromosome
46        let sample_name = format!("{}#{}", parts[0], parts[1]);
47        let contig_name = parts[2..].join("#"); // Join remaining parts (handles chr#extra)
48        (sample_name, contig_name)
49    } else {
50        // No multi-sample format detected, use entire header as contig name
51        ("unknown".to_string(), header.to_string())
52    }
53}
54
55/// FASTA/FASTQ parser
56pub struct GenomeIO<R> {
57    reader: Option<BufReader<R>>,
58    buffer: Vec<u8>,
59    next_header: Option<Vec<u8>>, // Buffered header line when we read ahead
60}
61
62impl<R: Read> GenomeIO<R> {
63    /// Create a new genome reader
64    pub fn new(reader: R) -> Self {
65        GenomeIO {
66            reader: Some(BufReader::with_capacity(4 << 20, reader)),
67            buffer: Vec::with_capacity(4 << 20),
68            next_header: None,
69        }
70    }
71
72    /// Read next contig without conversion (preserves ASCII)
73    pub fn read_contig(&mut self) -> io::Result<Option<(String, Contig)>> {
74        self.read_contig_impl(false)
75    }
76
77    /// Read next contig with nucleotide conversion (ASCII -> numeric)
78    pub fn read_contig_converted(&mut self) -> io::Result<Option<(String, Contig)>> {
79        self.read_contig_impl(true)
80    }
81
82    /// Read next contig with parsed sample/contig names (for multi-sample FASTAs)
83    /// Returns: (full_header, sample_name, contig_name, sequence)
84    ///
85    /// Parses headers in format: >sample#haplotype#chromosome
86    /// Example: >S288C#1#chrI -> sample="S288C#1", contig="chrI"
87    pub fn read_contig_with_sample(
88        &mut self,
89    ) -> io::Result<Option<(String, String, String, Contig)>> {
90        let (full_header, sequence) = match self.read_contig_impl(true)? {
91            Some(result) => result,
92            None => return Ok(None),
93        };
94
95        // Parse sample name from header
96        let (sample_name, contig_name) = parse_sample_from_header(&full_header);
97
98        Ok(Some((full_header, sample_name, contig_name, sequence)))
99    }
100
101    /// Read next contig preserving raw format (including newlines)
102    pub fn read_contig_raw(&mut self) -> io::Result<Option<(String, Contig)>> {
103        let reader = match &mut self.reader {
104            Some(r) => r,
105            None => return Ok(None),
106        };
107
108        let mut contig = Contig::new();
109
110        // Read ID line (starts with '>')
111        // Check if we have a buffered header from previous read
112        let header_line = if let Some(buffered) = self.next_header.take() {
113            buffered
114        } else {
115            self.buffer.clear();
116            let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
117            if bytes_read == 0 {
118                return Ok(None);
119            }
120            self.buffer.clone()
121        };
122
123        // Extract ID (skip '>' and trim whitespace)
124        let id_line = String::from_utf8_lossy(&header_line);
125        let id = id_line.trim_start_matches('>').trim().to_string();
126
127        // Read sequence data until next '>' or EOF
128        loop {
129            self.buffer.clear();
130            let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
131
132            if bytes_read == 0 {
133                // EOF reached
134                break;
135            }
136
137            // Check if this is the start of a new contig
138            if !self.buffer.is_empty() && self.buffer[0] == b'>' {
139                // Save this header for the next read
140                self.next_header = Some(self.buffer.clone());
141                break;
142            }
143
144            // Append sequence data
145            contig.extend_from_slice(&self.buffer);
146        }
147
148        if id.is_empty() || contig.is_empty() {
149            return Ok(None);
150        }
151
152        Ok(Some((id, contig)))
153    }
154
155    /// Internal implementation of contig reading with optional conversion
156    fn read_contig_impl(&mut self, converted: bool) -> io::Result<Option<(String, Contig)>> {
157        // First read raw
158        let (id, raw_contig) = match self.read_contig_raw()? {
159            Some(result) => result,
160            None => return Ok(None),
161        };
162
163        // Filter and optionally convert
164        let mut contig = Contig::with_capacity(raw_contig.len());
165
166        for &c in &raw_contig {
167            // Only keep characters > 64 (ASCII 'A' is 65)
168            if c > 64 && (c as usize) < CNV_NUM.len() {
169                if converted {
170                    contig.push(CNV_NUM[c as usize]);
171                } else {
172                    contig.push(c);
173                }
174            }
175        }
176
177        Ok(Some((id, contig)))
178    }
179}
180
181impl GenomeIO<File> {
182    /// Open a FASTA file for reading (uncompressed files only)
183    fn open_raw<P: AsRef<Path>>(path: P) -> io::Result<Self> {
184        let file = File::open(path)?;
185        Ok(GenomeIO::new(file))
186    }
187}
188
189impl GenomeIO<Box<dyn Read>> {
190    /// Open a FASTA file for reading (supports .gz files)
191    pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
192        let path = path.as_ref();
193        let file = File::open(path)?;
194
195        // Check if file is gzipped by extension
196        let reader: Box<dyn Read> = if path.extension().and_then(|s| s.to_str()) == Some("gz") {
197            // Use MultiGzDecoder to handle BGZIP (multi-member gzip) files
198            Box::new(MultiGzDecoder::new(file))
199        } else {
200            Box::new(file)
201        };
202
203        Ok(GenomeIO::new(reader))
204    }
205}
206
207/// Writer for FASTA files
208pub struct GenomeWriter<W> {
209    writer: W,
210}
211
212impl<W: Write> GenomeWriter<W> {
213    /// Create a new genome writer
214    pub fn new(writer: W) -> Self {
215        GenomeWriter { writer }
216    }
217
218    /// Save a contig directly (without line wrapping)
219    /// If gzip_level > 0, the header is gzip-compressed but sequence is raw
220    pub fn save_contig_directly(
221        &mut self,
222        id: &str,
223        contig: &Contig,
224        _gzip_level: u32,
225    ) -> io::Result<()> {
226        // For now, ignore gzip_level and write directly
227        // Full gzip support would require the refresh library or similar
228        writeln!(self.writer, ">{id}")?;
229
230        // Write sequence with line wrapping at 80 characters (matching C++ AGC behavior)
231        const LINE_WIDTH: usize = 80;
232        for chunk in contig.chunks(LINE_WIDTH) {
233            self.writer.write_all(chunk)?;
234            writeln!(self.writer)?;
235        }
236
237        Ok(())
238    }
239}
240
241impl GenomeWriter<File> {
242    /// Open a FASTA file for writing
243    pub fn create<P: AsRef<Path>>(path: P) -> io::Result<Self> {
244        let file = File::create(path)?;
245        Ok(GenomeWriter::new(file))
246    }
247}
248
249#[cfg(test)]
250mod tests {
251    use super::*;
252    use std::io::Cursor;
253
254    #[test]
255    fn test_read_simple_fasta() {
256        let fasta_data = b">seq1\nACGT\n>seq2\nTGCA\n";
257        let mut reader = GenomeIO::new(Cursor::new(fasta_data));
258
259        // Read first contig
260        let (id, seq) = reader.read_contig().unwrap().unwrap();
261        assert_eq!(id, "seq1");
262        assert_eq!(seq, b"ACGT");
263
264        // Read second contig
265        let (id, seq) = reader.read_contig().unwrap().unwrap();
266        assert_eq!(id, "seq2");
267        assert_eq!(seq, b"TGCA");
268
269        // No more contigs
270        assert!(reader.read_contig().unwrap().is_none());
271    }
272
273    #[test]
274    fn test_read_converted_fasta() {
275        let fasta_data = b">test\nACGT\n";
276        let mut reader = GenomeIO::new(Cursor::new(fasta_data));
277
278        let (id, seq) = reader.read_contig_converted().unwrap().unwrap();
279        assert_eq!(id, "test");
280        // A=0, C=1, G=2, T=3
281        assert_eq!(seq, vec![0, 1, 2, 3]);
282    }
283
284    #[test]
285    fn test_cnv_num_table() {
286        // Test uppercase
287        assert_eq!(CNV_NUM[b'A' as usize], 0);
288        assert_eq!(CNV_NUM[b'C' as usize], 1);
289        assert_eq!(CNV_NUM[b'G' as usize], 2);
290        assert_eq!(CNV_NUM[b'T' as usize], 3);
291
292        // Test lowercase
293        assert_eq!(CNV_NUM[b'a' as usize], 0);
294        assert_eq!(CNV_NUM[b'c' as usize], 1);
295        assert_eq!(CNV_NUM[b'g' as usize], 2);
296        assert_eq!(CNV_NUM[b't' as usize], 3);
297    }
298}