#![allow(dead_code)]
use std::fs::File;
use std::io::{self, BufRead, BufReader, Read, Write};
use std::path::Path;
use flate2::read::MultiGzDecoder;
use ragc_common::Contig;
pub(crate) const CNV_NUM: [u8; 128] = [
b'A', b'C', b'G', b'T', b'N', b'R', b'Y', b'S', b'W', b'K', b'M', b'B', b'D', b'H', b'V', b'U',
b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b' ',
b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
b' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30,
30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30,
];
pub fn parse_sample_from_header(header: &str) -> (String, String) {
let parts: Vec<&str> = header.split('#').collect();
if parts.len() >= 3 {
let sample_name = format!("{}#{}", parts[0], parts[1]);
let contig_name = parts[2..].join("#"); (sample_name, contig_name)
} else {
("unknown".to_string(), header.to_string())
}
}
pub struct GenomeIO<R> {
reader: Option<BufReader<R>>,
buffer: Vec<u8>,
next_header: Option<Vec<u8>>, }
impl<R: Read> GenomeIO<R> {
pub fn new(reader: R) -> Self {
GenomeIO {
reader: Some(BufReader::with_capacity(4 << 20, reader)),
buffer: Vec::with_capacity(4 << 20),
next_header: None,
}
}
pub fn read_contig(&mut self) -> io::Result<Option<(String, Contig)>> {
self.read_contig_impl(false)
}
pub fn read_contig_converted(&mut self) -> io::Result<Option<(String, Contig)>> {
self.read_contig_impl(true)
}
pub fn read_contig_with_sample(
&mut self,
) -> io::Result<Option<(String, String, String, Contig)>> {
let (full_header, sequence) = match self.read_contig_impl(true)? {
Some(result) => result,
None => return Ok(None),
};
let (sample_name, contig_name) = parse_sample_from_header(&full_header);
Ok(Some((full_header, sample_name, contig_name, sequence)))
}
pub fn read_contig_raw(&mut self) -> io::Result<Option<(String, Contig)>> {
let reader = match &mut self.reader {
Some(r) => r,
None => return Ok(None),
};
let mut contig = Contig::new();
let header_line = if let Some(buffered) = self.next_header.take() {
buffered
} else {
self.buffer.clear();
let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
if bytes_read == 0 {
return Ok(None);
}
self.buffer.clone()
};
let id_line = String::from_utf8_lossy(&header_line);
let id = id_line.trim_start_matches('>').trim().to_string();
loop {
self.buffer.clear();
let bytes_read = reader.read_until(b'\n', &mut self.buffer)?;
if bytes_read == 0 {
break;
}
if !self.buffer.is_empty() && self.buffer[0] == b'>' {
self.next_header = Some(self.buffer.clone());
break;
}
contig.extend_from_slice(&self.buffer);
}
if id.is_empty() || contig.is_empty() {
return Ok(None);
}
Ok(Some((id, contig)))
}
fn read_contig_impl(&mut self, converted: bool) -> io::Result<Option<(String, Contig)>> {
let (id, raw_contig) = match self.read_contig_raw()? {
Some(result) => result,
None => return Ok(None),
};
let mut contig = Contig::with_capacity(raw_contig.len());
for &c in &raw_contig {
if c > 64 && (c as usize) < CNV_NUM.len() {
if converted {
contig.push(CNV_NUM[c as usize]);
} else {
contig.push(c);
}
}
}
Ok(Some((id, contig)))
}
}
impl GenomeIO<File> {
fn open_raw<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let file = File::open(path)?;
Ok(GenomeIO::new(file))
}
}
impl GenomeIO<Box<dyn Read>> {
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let reader: Box<dyn Read> = if path.extension().and_then(|s| s.to_str()) == Some("gz") {
Box::new(MultiGzDecoder::new(file))
} else {
Box::new(file)
};
Ok(GenomeIO::new(reader))
}
}
pub struct GenomeWriter<W> {
writer: W,
}
impl<W: Write> GenomeWriter<W> {
pub fn new(writer: W) -> Self {
GenomeWriter { writer }
}
pub fn save_contig_directly(
&mut self,
id: &str,
contig: &Contig,
_gzip_level: u32,
) -> io::Result<()> {
writeln!(self.writer, ">{id}")?;
const LINE_WIDTH: usize = 80;
for chunk in contig.chunks(LINE_WIDTH) {
self.writer.write_all(chunk)?;
writeln!(self.writer)?;
}
Ok(())
}
}
impl GenomeWriter<File> {
pub fn create<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let file = File::create(path)?;
Ok(GenomeWriter::new(file))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_read_simple_fasta() {
let fasta_data = b">seq1\nACGT\n>seq2\nTGCA\n";
let mut reader = GenomeIO::new(Cursor::new(fasta_data));
let (id, seq) = reader.read_contig().unwrap().unwrap();
assert_eq!(id, "seq1");
assert_eq!(seq, b"ACGT");
let (id, seq) = reader.read_contig().unwrap().unwrap();
assert_eq!(id, "seq2");
assert_eq!(seq, b"TGCA");
assert!(reader.read_contig().unwrap().is_none());
}
#[test]
fn test_read_converted_fasta() {
let fasta_data = b">test\nACGT\n";
let mut reader = GenomeIO::new(Cursor::new(fasta_data));
let (id, seq) = reader.read_contig_converted().unwrap().unwrap();
assert_eq!(id, "test");
assert_eq!(seq, vec![0, 1, 2, 3]);
}
#[test]
fn test_cnv_num_table() {
assert_eq!(CNV_NUM[b'A' as usize], 0);
assert_eq!(CNV_NUM[b'C' as usize], 1);
assert_eq!(CNV_NUM[b'G' as usize], 2);
assert_eq!(CNV_NUM[b'T' as usize], 3);
assert_eq!(CNV_NUM[b'a' as usize], 0);
assert_eq!(CNV_NUM[b'c' as usize], 1);
assert_eq!(CNV_NUM[b'g' as usize], 2);
assert_eq!(CNV_NUM[b't' as usize], 3);
}
}