1#![allow(dead_code)]
5
6use std::fs::File;
7use std::io::{self, BufRead, BufReader, Read, Write};
8use std::path::Path;
9
10use flate2::read::MultiGzDecoder;
11use ragc_common::Contig;
12
13pub(crate) const CNV_NUM: [u8; 128] = [
16 b'A', b'C', b'G', b'T', b'N', b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V', b'U',
18 b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
20 b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
21 b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
22 b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
24 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
26 b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
28 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
30];
31
32pub fn parse_sample_from_header(header: &str) -> (String, String) {
41 let parts: Vec<&str> = header.split('#').collect();
43
44 if parts.len() >= 3 {
45 let sample_name = format!("{}#{}", parts[0], parts[1]);
47 let contig_name = parts[2..].join("#"); (sample_name, contig_name)
49 } else {
50 ("unknown".to_string(), header.to_string())
52 }
53}
54
55pub struct GenomeIO<R> {
57 reader: Option<BufReader<R>>,
58 buffer: Vec<u8>,
59 next_header: Option<Vec<u8>>, }
61
62impl<R: Read> GenomeIO<R> {
63 pub fn new(reader: R) -> Self {
65 GenomeIO {
66 reader: Some(BufReader::with_capacity(4 << 20, reader)),
67 buffer: Vec::with_capacity(4 << 20),
68 next_header: None,
69 }
70 }
71
72 pub fn read_contig(&mut self) -> io::Result<Option<(String, Contig)>> {
74 self.read_contig_impl(false)
75 }
76
77 pub fn read_contig_converted(&mut self) -> io::Result<Option<(String, Contig)>> {
79 self.read_contig_impl(true)
80 }
81
82 pub fn read_contig_with_sample(
88 &mut self,
89 ) -> io::Result<Option<(String, String, String, Contig)>> {
90 let (full_header, sequence) = match self.read_contig_impl(true)? {
91 Some(result) => result,
92 None => return Ok(None),
93 };
94
95 let (sample_name, contig_name) = parse_sample_from_header(&full_header);
97
98 Ok(Some((full_header, sample_name, contig_name, sequence)))
99 }
100
101 pub fn read_contig_raw(&mut self) -> io::Result<Option<(String, Contig)>> {
103 let reader = match &mut self.reader {
104 Some(r) => r,
105 None => return Ok(None),
106 };
107
108 let mut contig = Contig::new();
109
110 let header_line = if let Some(buffered) = self.next_header.take() {
113 buffered
114 } else {
115 self.buffer.clear();
116 let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
117 if bytes_read == 0 {
118 return Ok(None);
119 }
120 self.buffer.clone()
121 };
122
123 let id_line = String::from_utf8_lossy(&header_line);
125 let id = id_line.trim_start_matches('>').trim().to_string();
126
127 loop {
129 self.buffer.clear();
130 let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
131
132 if bytes_read == 0 {
133 break;
135 }
136
137 if !self.buffer.is_empty() && self.buffer[0] == b'>' {
139 self.next_header = Some(self.buffer.clone());
141 break;
142 }
143
144 contig.extend_from_slice(&self.buffer);
146 }
147
148 if id.is_empty() || contig.is_empty() {
149 return Ok(None);
150 }
151
152 Ok(Some((id, contig)))
153 }
154
155 fn read_contig_impl(&mut self, converted: bool) -> io::Result<Option<(String, Contig)>> {
157 let (id, raw_contig) = match self.read_contig_raw()? {
159 Some(result) => result,
160 None => return Ok(None),
161 };
162
163 let mut contig = Contig::with_capacity(raw_contig.len());
165
166 for &c in &raw_contig {
167 if c > 64 && (c as usize) < CNV_NUM.len() {
169 if converted {
170 contig.push(CNV_NUM[c as usize]);
171 } else {
172 contig.push(c);
173 }
174 }
175 }
176
177 Ok(Some((id, contig)))
178 }
179}
180
181impl GenomeIO<File> {
182 fn open_raw<P: AsRef<Path>>(path: P) -> io::Result<Self> {
184 let file = File::open(path)?;
185 Ok(GenomeIO::new(file))
186 }
187}
188
189impl GenomeIO<Box<dyn Read>> {
190 pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
192 let path = path.as_ref();
193 let file = File::open(path)?;
194
195 let reader: Box<dyn Read> = if path.extension().and_then(|s| s.to_str()) == Some("gz") {
197 Box::new(MultiGzDecoder::new(file))
199 } else {
200 Box::new(file)
201 };
202
203 Ok(GenomeIO::new(reader))
204 }
205}
206
207pub struct GenomeWriter<W> {
209 writer: W,
210}
211
212impl<W: Write> GenomeWriter<W> {
213 pub fn new(writer: W) -> Self {
215 GenomeWriter { writer }
216 }
217
218 pub fn save_contig_directly(
221 &mut self,
222 id: &str,
223 contig: &Contig,
224 _gzip_level: u32,
225 ) -> io::Result<()> {
226 writeln!(self.writer, ">{id}")?;
229
230 const LINE_WIDTH: usize = 80;
232 for chunk in contig.chunks(LINE_WIDTH) {
233 self.writer.write_all(chunk)?;
234 writeln!(self.writer)?;
235 }
236
237 Ok(())
238 }
239}
240
241impl GenomeWriter<File> {
242 pub fn create<P: AsRef<Path>>(path: P) -> io::Result<Self> {
244 let file = File::create(path)?;
245 Ok(GenomeWriter::new(file))
246 }
247}
248
249#[cfg(test)]
250mod tests {
251 use super::*;
252 use std::io::Cursor;
253
254 #[test]
255 fn test_read_simple_fasta() {
256 let fasta_data = b">seq1\nACGT\n>seq2\nTGCA\n";
257 let mut reader = GenomeIO::new(Cursor::new(fasta_data));
258
259 let (id, seq) = reader.read_contig().unwrap().unwrap();
261 assert_eq!(id, "seq1");
262 assert_eq!(seq, b"ACGT");
263
264 let (id, seq) = reader.read_contig().unwrap().unwrap();
266 assert_eq!(id, "seq2");
267 assert_eq!(seq, b"TGCA");
268
269 assert!(reader.read_contig().unwrap().is_none());
271 }
272
273 #[test]
274 fn test_read_converted_fasta() {
275 let fasta_data = b">test\nACGT\n";
276 let mut reader = GenomeIO::new(Cursor::new(fasta_data));
277
278 let (id, seq) = reader.read_contig_converted().unwrap().unwrap();
279 assert_eq!(id, "test");
280 assert_eq!(seq, vec![0, 1, 2, 3]);
282 }
283
284 #[test]
285 fn test_cnv_num_table() {
286 assert_eq!(CNV_NUM[b'A' as usize], 0);
288 assert_eq!(CNV_NUM[b'C' as usize], 1);
289 assert_eq!(CNV_NUM[b'G' as usize], 2);
290 assert_eq!(CNV_NUM[b'T' as usize], 3);
291
292 assert_eq!(CNV_NUM[b'a' as usize], 0);
294 assert_eq!(CNV_NUM[b'c' as usize], 1);
295 assert_eq!(CNV_NUM[b'g' as usize], 2);
296 assert_eq!(CNV_NUM[b't' as usize], 3);
297 }
298}