use std::fs::File;
use std::io::{self, BufRead, BufReader, Read, Seek, Stdin};
use std::path::Path;
use bzip2_rs::DecoderReader;
use flate2::read::MultiGzDecoder;
use super::{Sequence, SequenceFile};
use crate::config::FastQCConfig;
/// The concrete byte source behind a `FastQFile`, one variant per supported
/// input flavour.
///
/// Every variant wraps its source in a `BufReader`; for the compressed
/// variants the `BufReader` sits on the outside of the decoder, so it buffers
/// the decoded bytes.
enum ReaderKind {
/// A regular file read as-is.
Plain(BufReader<File>),
/// A file decoded through flate2's `MultiGzDecoder`.
Gzip(BufReader<MultiGzDecoder<File>>),
/// A file decoded through bzip2_rs's `DecoderReader`.
/// NOTE(review): presumably boxed to keep the enum itself small — confirm.
Bzip2(Box<BufReader<DecoderReader<File>>>),
/// The process's standard input.
Stdin(BufReader<Stdin>),
}
impl BufRead for ReaderKind {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
match self {
ReaderKind::Plain(r) => r.fill_buf(),
ReaderKind::Gzip(r) => r.fill_buf(),
ReaderKind::Bzip2(r) => r.fill_buf(),
ReaderKind::Stdin(r) => r.fill_buf(),
}
}
fn consume(&mut self, amt: usize) {
match self {
ReaderKind::Plain(r) => r.consume(amt),
ReaderKind::Gzip(r) => r.consume(amt),
ReaderKind::Bzip2(r) => r.consume(amt),
ReaderKind::Stdin(r) => r.consume(amt),
}
}
}
impl Read for ReaderKind {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
ReaderKind::Plain(r) => r.read(buf),
ReaderKind::Gzip(r) => r.read(buf),
ReaderKind::Bzip2(r) => r.read(buf),
ReaderKind::Stdin(r) => r.read(buf),
}
}
}
/// Guesses the compression of the file at `path` from its leading magic bytes.
///
/// Only the first two bytes are inspected:
///
/// * `0x1f 0x8b` gives `"gz"`
/// * `0x42 0x5a` (ASCII `BZ`) gives `"bz2"`
/// * anything else, including a file shorter than two bytes, gives `"none"`
///
/// # Errors
///
/// Propagates any I/O error raised while opening or reading the file.
fn detect_compression_from_magic(path: &Path) -> io::Result<&'static str> {
    // A bare `read` may return fewer bytes than requested or fail with
    // `Interrupted`; `read_to_end` on a two-byte `take` keeps reading until
    // both bytes have arrived or the file really ends.
    let mut magic = Vec::with_capacity(2);
    File::open(path)?.take(2).read_to_end(&mut magic)?;

    Ok(match magic.as_slice() {
        [0x1f, 0x8b] => "gz",
        [b'B', b'Z'] => "bz2",
        _ => "none",
    })
}
/// A FASTQ input that yields one record at a time and always keeps the next
/// record pre-read.
///
/// Instances are created with `FastQFile::open`, which selects the reader
/// (plain, gzip, bzip2 or stdin) and reads the first record straight away.
pub struct FastQFile {
/// Line source; decompression, if any, happens inside it.
reader: ReaderKind,
/// Final component of the opened path (the whole path if it has none).
name: String,
/// On-disk size in bytes taken from file metadata; `u64::MAX` for stdin.
file_size: u64,
/// Clone of the opened file handle, queried for its stream position when
/// reporting progress; `None` for stdin.
position_handle: Option<File>,
/// The record that the next call to `next` will return; `None` once the
/// input is exhausted.
next_sequence: Option<Sequence>,
/// Number of line reads performed so far (the read that hits end-of-input
/// counts too); quoted in format error messages.
line_number: u64,
/// Whether the input was judged to be colorspace data.
is_colorspace: bool,
/// Set once the colorspace decision has been made, which happens on the
/// first record read.
colorspace_checked: bool,
/// Copy of `config.casava` taken at open time.
casava_mode: bool,
/// Copy of `config.nofilter` taken at open time.
nofilter: bool,
/// Smallest quality byte seen in any record read so far; starts at 255.
pub lowest_char: u8,
/// Scratch buffer reused for every line read.
line_buf: String,
}
impl FastQFile {
    /// Opens `path` as a FASTQ source and pre-reads its first record.
    ///
    /// A path whose file name starts with `stdin` selects standard input.
    /// Otherwise the compression is chosen from the (case-insensitive) `.gz`
    /// or `.bz2` suffix, falling back to sniffing the file's magic bytes.
    ///
    /// `config.casava` and `config.nofilter` are copied into the reader and
    /// control whether records are flagged as filtered.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from inspecting or opening the file, and any
    /// error produced while reading the first record (see `read_next`).
    pub fn open<P: AsRef<Path>>(config: &FastQCConfig, path: P) -> io::Result<Self> {
        let path = path.as_ref();
        let name = path
            .file_name()
            .map(|n| n.to_string_lossy().into_owned())
            .unwrap_or_else(|| path.to_string_lossy().into_owned());

        let is_stdin = name.starts_with("stdin");
        let (reader, position_handle, file_size) = if is_stdin {
            // There is no size to measure against for a stream.
            (
                ReaderKind::Stdin(BufReader::new(io::stdin())),
                None,
                u64::MAX,
            )
        } else {
            let file_size = std::fs::metadata(path)?.len();

            let lower_name = name.to_lowercase();
            let compression = if lower_name.ends_with(".gz") {
                "gz"
            } else if lower_name.ends_with(".bz2") {
                "bz2"
            } else {
                detect_compression_from_magic(path)?
            };

            let file = File::open(path)?;
            // Kept separately so progress can be read from the raw file even
            // though `file` itself is moved into the decoder.
            let pos_handle = file.try_clone()?;
            let reader = match compression {
                "gz" => ReaderKind::Gzip(BufReader::new(MultiGzDecoder::new(file))),
                "bz2" => ReaderKind::Bzip2(Box::new(BufReader::new(DecoderReader::new(file)))),
                _ => ReaderKind::Plain(BufReader::new(file)),
            };
            (reader, Some(pos_handle), file_size)
        };

        let mut fq = FastQFile {
            reader,
            name,
            file_size,
            position_handle,
            next_sequence: None,
            line_number: 0,
            is_colorspace: false,
            colorspace_checked: false,
            casava_mode: config.casava,
            nofilter: config.nofilter,
            lowest_char: 255,
            line_buf: String::with_capacity(512),
        };
        fq.read_next()?;
        Ok(fq)
    }

    /// Reads the next line into `line_buf` with trailing `\n` / `\r` removed.
    ///
    /// Returns `Ok(false)` at end of input and `Ok(true)` otherwise.
    /// `line_number` is bumped on every successful read, including the one
    /// that reports end of input.
    fn read_line(&mut self) -> io::Result<bool> {
        self.line_buf.clear();
        let bytes_read = self.reader.read_line(&mut self.line_buf)?;
        self.line_number += 1;
        if bytes_read == 0 {
            return Ok(false);
        }
        let kept = self
            .line_buf
            .trim_end_matches(|c| c == '\n' || c == '\r')
            .len();
        self.line_buf.truncate(kept);
        Ok(true)
    }

    /// Reads a line that must exist because a record has already started.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedEof` if the input ends here, besides any error from
    /// the underlying read.
    fn read_record_line(&mut self) -> io::Result<()> {
        if self.read_line()? {
            Ok(())
        } else {
            Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Ran out of data in the middle of a fastq entry. Your file is probably truncated",
            ))
        }
    }

    /// Parses the next four-line record into `next_sequence`.
    ///
    /// Blank lines before the `@` header are skipped; reaching end of input
    /// there is a clean finish and sets `next_sequence` to `None`.
    ///
    /// The colorspace decision is taken once, from the first record. When it
    /// is positive, every record is converted to bases and keeps its
    /// upper-cased colorspace string alongside.
    ///
    /// # Errors
    ///
    /// * `InvalidData` if the header does not start with `@`, the separator
    ///   does not start with `+`, or a colorspace record contains a character
    ///   that is not a colour code.
    /// * `UnexpectedEof` if the input ends inside a record.
    /// * Any error from the underlying reader.
    fn read_next(&mut self) -> io::Result<()> {
        // Skip blank lines between records.
        loop {
            if !self.read_line()? {
                self.next_sequence = None;
                return Ok(());
            }
            if !self.line_buf.is_empty() {
                break;
            }
        }
        if !self.line_buf.starts_with('@') {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("ID line didn't start with '@' at line {}", self.line_number),
            ));
        }
        let id = self.line_buf.clone();

        self.read_record_line()?;
        let sequence_line = self.line_buf.clone();
        let sequence_line_number = self.line_number;

        self.read_record_line()?;
        if !self.line_buf.starts_with('+') {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Midline '{}' didn't start with '+' at {}",
                    self.line_buf, self.line_number
                ),
            ));
        }

        self.read_record_line()?;
        let quality_bytes = self.line_buf.as_bytes().to_vec();
        if let Some(lowest) = quality_bytes.iter().copied().min() {
            self.lowest_char = self.lowest_char.min(lowest);
        }

        if !self.colorspace_checked {
            self.colorspace_checked = true;
            self.is_colorspace = check_colorspace(&sequence_line);
        }

        let is_filtered =
            self.casava_mode && !self.nofilter && id.find(":Y:").is_some_and(|pos| pos > 0);

        let mut sequence = if self.is_colorspace {
            let upper = sequence_line.to_ascii_uppercase();

            // Only the first record was vetted by `check_colorspace`, and the
            // converter panics on a character that is not a colour code.
            // Mirror its scan (it stops at the first unknown-colour marker)
            // and report the problem as a format error instead.
            let offending = upper
                .bytes()
                .skip(1)
                .take_while(|&b| !matches!(b, b'.' | b'4'..=b'6'))
                .find(|&b| !matches!(b, b'0'..=b'3'));
            if let Some(bad) = offending {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "Unexpected colorspace char '{}' at line {}",
                        bad as char, sequence_line_number
                    ),
                ));
            }

            let bases = convert_colorspace_to_bases(&upper);
            let mut s = Sequence::new(id, bases.into_bytes(), quality_bytes);
            s.colorspace = Some(upper.into_bytes());
            s
        } else {
            Sequence::new(id, sequence_line.into_bytes(), quality_bytes)
        };
        sequence.is_filtered = is_filtered;
        self.next_sequence = Some(sequence);
        Ok(())
    }
}
impl SequenceFile for FastQFile {
fn next(&mut self) -> Option<io::Result<Sequence>> {
let current = self.next_sequence.take()?;
if let Err(e) = self.read_next() {
return Some(Err(e));
}
Some(Ok(current))
}
fn name(&self) -> &str {
&self.name
}
fn is_colorspace(&self) -> bool {
self.is_colorspace
}
fn percent_complete(&self) -> f64 {
if self.next_sequence.is_none() {
return 100.0;
}
if self.name.starts_with("stdin") {
return 0.0;
}
if let Some(ref handle) = self.position_handle {
if let Ok(mut h) = handle.try_clone() {
if let Ok(pos) = h.stream_position() {
return (pos as f64 / self.file_size as f64) * 100.0;
}
}
}
0.0
}
}
/// Reports whether `seq` has the shape of a colorspace read.
///
/// That shape is one leading base letter (`G`, `A`, `T`, `C` or `N`, in
/// either case) followed by at least one character, each of which is `.` or
/// a digit `0` through `6`.
fn check_colorspace(seq: &str) -> bool {
    match seq.as_bytes() {
        [lead, colors @ ..] if !colors.is_empty() => {
            let lead_is_base = matches!(
                lead.to_ascii_uppercase(),
                b'G' | b'A' | b'T' | b'C' | b'N'
            );
            lead_is_base && colors.iter().all(|&c| matches!(c, b'.' | b'0'..=b'6'))
        }
        // Empty input, or a lone character with no colours after it.
        _ => false,
    }
}
/// Decodes a colorspace read into base letters.
///
/// `s` is expected to be upper-case: one leading reference base followed by
/// colour codes. The result has one base per colour code, so it is one
/// character shorter than `s`; empty input gives an empty string.
///
/// Each colour `0`-`3` maps the previous base (initially the leading base) to
/// the next one. If the previous base is not one of `A`, `C`, `G`, `T`,
/// colours `1`-`3` yield `N` and colour `0` repeats it unchanged. This holds
/// in every build profile, so a read led by `N` decodes to `N`s. The first
/// `.`, `4`, `5` or `6` marks an unknown colour: that position and everything
/// after it become `N`.
///
/// # Panics
///
/// Panics if a character other than `.` or `0`-`6` is met among the colour
/// codes before the first unknown-colour marker.
fn convert_colorspace_to_bases(s: &str) -> String {
    let (lead, colors) = match s.as_bytes().split_first() {
        Some((&lead, colors)) => (lead, colors),
        None => return String::new(),
    };

    let mut bases = Vec::with_capacity(colors.len());
    let mut reference = lead;
    for &color in colors {
        let base = match (color, reference) {
            (b'0', same) => same,

            (b'1', b'A') => b'C',
            (b'1', b'C') => b'A',
            (b'1', b'G') => b'T',
            (b'1', b'T') => b'G',

            (b'2', b'A') => b'G',
            (b'2', b'G') => b'A',
            (b'2', b'C') => b'T',
            (b'2', b'T') => b'C',

            (b'3', b'A') => b'T',
            (b'3', b'T') => b'A',
            (b'3', b'G') => b'C',
            (b'3', b'C') => b'G',

            // A real colour applied to an unknown reference base.
            (b'1'..=b'3', _) => b'N',

            // Unknown colour: nothing after this point can be decoded.
            (b'.' | b'4'..=b'6', _) => break,

            (other, _) => panic!("Unexpected colorspace char '{}'", other as char),
        };
        bases.push(base);
        reference = base;
    }
    // Pad whatever was left undecoded after an unknown colour.
    bases.resize(colors.len(), b'N');

    String::from_utf8(bases).expect("colorspace output should be valid UTF-8")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture holding a single record.
    const MINIMAL_FASTQ: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/minimal.fastq");
    /// Fixture holding five records.
    const COMPLEX_FASTQ: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/complex.fastq");

    /// Asserts every `(colorspace input, expected bases)` pair.
    fn assert_conversions(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(convert_colorspace_to_bases(input), expected);
        }
    }

    #[test]
    fn test_check_colorspace_positive() {
        for accepted in ["G0123456", "A.012", "t00"] {
            assert!(check_colorspace(accepted));
        }
    }

    #[test]
    fn test_check_colorspace_negative() {
        for rejected in ["ACGTACGT", "A", "", "X012"] {
            assert!(!check_colorspace(rejected));
        }
    }

    #[test]
    fn test_convert_colorspace_basic() {
        assert_conversions(&[("A0", "A"), ("A1", "C"), ("A2", "G"), ("A3", "T")]);
    }

    #[test]
    fn test_convert_colorspace_chained() {
        assert_conversions(&[("A00", "AA"), ("A01", "AC"), ("G10", "TT")]);
    }

    #[test]
    fn test_convert_colorspace_unknown_fills_n() {
        assert_conversions(&[("A.12", "NNN"), ("A04", "AN")]);
    }

    #[test]
    fn test_convert_colorspace_empty() {
        assert_conversions(&[("", "")]);
    }

    #[test]
    fn test_read_minimal_fastq() {
        let mut fastq = FastQFile::open(&FastQCConfig::default(), MINIMAL_FASTQ).unwrap();

        let record = fastq.next().unwrap().unwrap();
        assert_eq!(record.id, "@READ0001");
        assert_eq!(record.sequence, b"AAAAAAAAAAAAAAAA");
        assert_eq!(record.quality, b"IIIIIIIIIIIIIIII");
        assert!(!record.is_filtered);
        assert!(!fastq.is_colorspace());

        // The fixture holds exactly one record.
        assert!(fastq.next().is_none());
    }

    #[test]
    fn test_read_complex_fastq() {
        let mut fastq = FastQFile::open(&FastQCConfig::default(), COMPLEX_FASTQ).unwrap();

        let mut records_seen = 0;
        while let Some(outcome) = fastq.next() {
            let record = outcome.unwrap();
            records_seen += 1;
            assert_eq!(record.sequence, b"ACGTACGTACGTACGT");
            assert_eq!(record.quality, b"IIIIIIIIIIIIIIII");
            assert_eq!(record.id, format!("@READ{:04}", records_seen));
        }
        assert_eq!(records_seen, 5);
    }

    #[test]
    fn test_lowest_char_tracking() {
        let mut fastq = FastQFile::open(&FastQCConfig::default(), MINIMAL_FASTQ).unwrap();
        // Drain the file so every quality line has been seen.
        while fastq.next().is_some() {}
        assert_eq!(fastq.lowest_char, b'I');
    }

    #[test]
    fn test_casava_filter_detection() {
        let casava_config = FastQCConfig {
            casava: true,
            nofilter: false,
            ..FastQCConfig::default()
        };
        let mut fastq = FastQFile::open(&casava_config, MINIMAL_FASTQ).unwrap();
        let record = fastq.next().unwrap().unwrap();
        assert!(!record.is_filtered);
    }

    #[test]
    fn test_sequence_uppercase() {
        let mixed_case = Sequence::new(
            "@test".to_string(),
            b"acgtACGT".to_vec(),
            b"IIIIIIII".to_vec(),
        );
        assert_eq!(mixed_case.sequence, b"ACGTACGT");
    }
}