use crate::core::{GenomicReader, GenomicRecordIterator};
use crate::error::{Error, Result};
use crate::io::Compression;
use flate2::read::MultiGzDecoder;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QualityEncoding {
Phred33,
Phred64,
}
impl QualityEncoding {
pub fn offset(&self) -> u8 {
match self {
QualityEncoding::Phred33 => 33,
QualityEncoding::Phred64 => 64,
}
}
pub fn detect(quality: &str) -> Self {
for byte in quality.bytes() {
if byte < 64 {
return QualityEncoding::Phred33;
}
}
QualityEncoding::Phred33
}
}
#[derive(Debug, Clone)]
pub struct FastqRecord {
pub id: String,
pub description: Option<String>,
pub sequence: String,
pub quality: String,
}
impl FastqRecord {
pub fn len(&self) -> usize {
self.sequence.len()
}
pub fn is_empty(&self) -> bool {
self.sequence.is_empty()
}
pub fn phred_scores(&self, encoding: QualityEncoding) -> Vec<u8> {
let offset = encoding.offset();
self.quality
.bytes()
.map(|b| b.saturating_sub(offset))
.collect()
}
pub fn mean_quality(&self) -> f64 {
self.mean_quality_with_encoding(QualityEncoding::Phred33)
}
pub fn mean_quality_with_encoding(&self, encoding: QualityEncoding) -> f64 {
let scores = self.phred_scores(encoding);
if scores.is_empty() {
return 0.0;
}
let sum: u32 = scores.iter().map(|&s| s as u32).sum();
sum as f64 / scores.len() as f64
}
pub fn is_valid(&self) -> bool {
self.sequence.len() == self.quality.len()
}
pub fn gc_content(&self) -> f64 {
let gc_count = self
.sequence
.bytes()
.filter(|&b| b == b'G' || b == b'C' || b == b'g' || b == b'c')
.count();
if self.sequence.is_empty() {
0.0
} else {
gc_count as f64 / self.sequence.len() as f64
}
}
pub fn n_content(&self) -> f64 {
let n_count = self
.sequence
.bytes()
.filter(|&b| b == b'N' || b == b'n')
.count();
if self.sequence.is_empty() {
0.0
} else {
n_count as f64 / self.sequence.len() as f64
}
}
}
#[derive(Debug, Clone, Default)]
pub struct FastqHeader {
pub encoding: Option<QualityEncoding>,
}
pub struct FastqReader {
reader: Box<dyn BufRead>,
header: FastqHeader,
line_number: usize,
id_buffer: String,
seq_buffer: String,
sep_buffer: String,
qual_buffer: String,
}
impl FastqReader {
pub fn from_buffer<R: BufRead + 'static>(reader: R) -> Self {
Self {
reader: Box::new(reader),
header: FastqHeader::default(),
line_number: 0,
id_buffer: String::with_capacity(256),
seq_buffer: String::with_capacity(256),
sep_buffer: String::with_capacity(4),
qual_buffer: String::with_capacity(256),
}
}
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let compression = Compression::from_path(path);
let reader: Box<dyn BufRead> = match compression {
Compression::Gzip | Compression::Bgzip => {
Box::new(BufReader::new(MultiGzDecoder::new(file)))
}
_ => Box::new(BufReader::new(file)),
};
Ok(Self {
reader,
header: FastqHeader::default(),
line_number: 0,
id_buffer: String::with_capacity(256),
seq_buffer: String::with_capacity(256),
sep_buffer: String::with_capacity(4),
qual_buffer: String::with_capacity(256),
})
}
pub fn header(&self) -> &FastqHeader {
&self.header
}
fn parse_record(&mut self) -> Result<Option<FastqRecord>> {
self.id_buffer.clear();
let bytes = self.reader.read_line(&mut self.id_buffer)?;
if bytes > 0 {
self.line_number += 1;
} else {
return Ok(None); }
let id_line = self.id_buffer.trim();
if !id_line.starts_with('@') {
return Err(Error::Parse(format!(
"Line {}: Expected '@' at start of FASTQ record, got: {}",
self.line_number,
id_line.chars().take(50).collect::<String>()
)));
}
let header_content = &id_line[1..]; let (id, description) = if let Some((id, desc)) = header_content.split_once(char::is_whitespace) {
(id.to_string(), Some(desc.trim().to_string()))
} else {
(header_content.to_string(), None)
};
self.seq_buffer.clear();
let bytes = self.reader.read_line(&mut self.seq_buffer)?;
if bytes > 0 {
self.line_number += 1;
} else {
return Err(Error::Parse(format!(
"Line {}: Unexpected end of file while reading sequence",
self.line_number
)));
}
let sequence = self.seq_buffer.trim().to_string();
self.sep_buffer.clear();
let bytes = self.reader.read_line(&mut self.sep_buffer)?;
if bytes > 0 {
self.line_number += 1;
} else {
return Err(Error::Parse(format!(
"Line {}: Unexpected end of file while reading separator",
self.line_number
)));
}
let separator = self.sep_buffer.trim();
if !separator.starts_with('+') {
return Err(Error::Parse(format!(
"Line {}: Expected '+' separator, got: {}",
self.line_number, separator
)));
}
self.qual_buffer.clear();
let bytes = self.reader.read_line(&mut self.qual_buffer)?;
if bytes > 0 {
self.line_number += 1;
} else {
return Err(Error::Parse(format!(
"Line {}: Unexpected end of file while reading quality",
self.line_number
)));
}
let quality = self.qual_buffer.trim().to_string();
if sequence.len() != quality.len() {
return Err(Error::Parse(format!(
"Line {}: Sequence length ({}) does not match quality length ({})",
self.line_number,
sequence.len(),
quality.len()
)));
}
if self.header.encoding.is_none() && !quality.is_empty() {
self.header.encoding = Some(QualityEncoding::detect(&quality));
}
Ok(Some(FastqRecord {
id,
description,
sequence,
quality,
}))
}
}
impl GenomicRecordIterator for FastqReader {
type Record = FastqRecord;
fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
Ok(None)
}
fn next_record(&mut self) -> Result<Option<Self::Record>> {
self.parse_record()
}
}
impl GenomicReader for FastqReader {
type Metadata = FastqHeader;
fn metadata(&self) -> &Self::Metadata {
&self.header
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_fastq_basic_read() -> Result<()> {
let fastq_data = "@READ1 description here\n\
ACGTACGTACGT\n\
+\n\
IIIIIIIIIIII\n\
@READ2\n\
GGGGCCCCAAAA\n\
+READ2\n\
HHHHHHHHHHHH\n";
let mut temp_file = NamedTempFile::new()?;
temp_file.write_all(fastq_data.as_bytes())?;
temp_file.flush()?;
let mut reader = FastqReader::from_path(temp_file.path())?;
let record1 = reader.next_record()?.expect("Should have first record");
assert_eq!(record1.id, "READ1");
assert_eq!(record1.description, Some("description here".to_string()));
assert_eq!(record1.sequence, "ACGTACGTACGT");
assert_eq!(record1.quality, "IIIIIIIIIIII");
assert_eq!(record1.len(), 12);
assert!(record1.is_valid());
let record2 = reader.next_record()?.expect("Should have second record");
assert_eq!(record2.id, "READ2");
assert_eq!(record2.description, None);
assert_eq!(record2.sequence, "GGGGCCCCAAAA");
assert_eq!(record2.quality, "HHHHHHHHHHHH");
assert!(reader.next_record()?.is_none());
Ok(())
}
#[test]
fn test_phred_scores() {
let record = FastqRecord {
id: "test".to_string(),
description: None,
sequence: "ACGT".to_string(),
quality: "!5IA".to_string(), };
let scores = record.phred_scores(QualityEncoding::Phred33);
assert_eq!(scores, vec![0, 20, 40, 32]);
let mean = record.mean_quality();
assert!((mean - 23.0).abs() < 0.1);
}
#[test]
fn test_gc_content() {
let record = FastqRecord {
id: "test".to_string(),
description: None,
sequence: "ACGTACGT".to_string(), quality: "IIIIIIII".to_string(),
};
assert_eq!(record.gc_content(), 0.5);
}
#[test]
fn test_invalid_record() {
let fastq_data = "@READ1\n\
ACGT\n\
+\n\
III\n";
let mut temp_file = NamedTempFile::new().unwrap();
temp_file.write_all(fastq_data.as_bytes()).unwrap();
temp_file.flush().unwrap();
let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
let result = reader.next_record();
assert!(result.is_err());
}
#[test]
fn test_quality_encoding_detection() {
assert_eq!(
QualityEncoding::detect("!5IA"),
QualityEncoding::Phred33
);
assert_eq!(
QualityEncoding::detect("@@@@"),
QualityEncoding::Phred33 );
}
#[test]
fn test_missing_separator() {
let fastq_data = "@READ1\n\
ACGT\n\
-\n\
IIII\n";
let mut temp_file = NamedTempFile::new().unwrap();
temp_file.write_all(fastq_data.as_bytes()).unwrap();
temp_file.flush().unwrap();
let mut reader = FastqReader::from_path(temp_file.path()).unwrap();
let result = reader.next_record();
assert!(result.is_err());
}
}