use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::parallel::Mergeable;
use crate::stats::{CategoryCounter, RunningStats};
use super::reader::{FastqReader, FastqRecord, QualityEncoding};
#[derive(Debug, Clone)]
pub struct FastqStats {
pub total_reads: usize,
pub total_bases: usize,
pub length_stats: RunningStats,
pub mean_quality_stats: RunningStats,
pub gc_content_stats: RunningStats,
pub length_distribution: CategoryCounter<usize>,
pub quality_distribution: CategoryCounter<u8>,
pub high_quality_reads_q20: usize,
pub high_quality_reads_q30: usize,
pub encoding: Option<QualityEncoding>,
pub n_bases: usize,
pub base_counts: CategoryCounter<char>,
}
impl Default for FastqStats {
fn default() -> Self {
Self::new()
}
}
impl FastqStats {
pub fn new() -> Self {
Self {
total_reads: 0,
total_bases: 0,
length_stats: RunningStats::new(),
mean_quality_stats: RunningStats::new(),
gc_content_stats: RunningStats::new(),
length_distribution: CategoryCounter::new(),
quality_distribution: CategoryCounter::new(),
high_quality_reads_q20: 0,
high_quality_reads_q30: 0,
encoding: None,
n_bases: 0,
base_counts: CategoryCounter::new(),
}
}
pub fn update(&mut self, record: &FastqRecord) {
self.total_reads += 1;
let length = record.len();
self.total_bases += length;
self.length_stats.push(length as f64);
self.length_distribution.increment(length);
let mut gc_count = 0;
let mut qual_sum = 0u32;
for (seq_byte, qual_byte) in record.sequence.bytes().zip(record.quality.bytes()) {
qual_sum += (qual_byte.saturating_sub(33)) as u32;
if seq_byte == b'G' || seq_byte == b'C' || seq_byte == b'g' || seq_byte == b'c' {
gc_count += 1;
}
let base = seq_byte as char;
self.base_counts.increment(base);
if base == 'N' || base == 'n' {
self.n_bases += 1;
}
}
let mean_qual = if length > 0 {
qual_sum as f64 / length as f64
} else {
0.0
};
self.mean_quality_stats.push(mean_qual);
self.quality_distribution.increment(mean_qual as u8);
if mean_qual >= 20.0 {
self.high_quality_reads_q20 += 1;
}
if mean_qual >= 30.0 {
self.high_quality_reads_q30 += 1;
}
let gc_content = if length > 0 {
gc_count as f64 / length as f64
} else {
0.0
};
self.gc_content_stats.push(gc_content);
}
pub fn compute(reader: &mut FastqReader) -> Result<Self> {
let mut stats = Self::new();
while let Some(record) = reader.next_record()? {
if stats.encoding.is_none() {
stats.encoding = reader.header().encoding;
}
stats.update(&record);
}
Ok(stats)
}
pub fn par_compute<P: AsRef<std::path::Path>>(
path: P,
config: Option<crate::parallel::ParallelConfig>,
) -> Result<Self> {
use crate::parallel::{calculate_chunks, find_record_boundary};
use rayon::prelude::*;
use std::fs::File;
use std::io::{BufReader, Seek};
let config = config.unwrap_or_default();
let path = path.as_ref();
let file_size = std::fs::metadata(path)?.len();
if file_size < config.min_chunk_size as u64 {
let mut reader = FastqReader::from_path(path)?;
return Self::compute(&mut reader);
}
let chunks = calculate_chunks(file_size, &config);
let pool = crate::parallel::get_thread_pool(config.threads())?;
let partial_stats: Vec<Self> = pool.install(|| {
chunks
.par_iter()
.map(|chunk| -> Result<Self> {
let file = File::open(path)?;
let mut reader = BufReader::new(file);
let start_pos = find_record_boundary(&mut reader, chunk.start)?;
let start_pos = match start_pos {
Some(pos) => pos,
None => return Ok(Self::new()), };
reader.seek(std::io::SeekFrom::Start(start_pos))?;
let mut fastq_reader = FastqReader::from_buffer(reader);
let mut stats = Self::new();
let mut current_pos = start_pos;
while let Some(record) = fastq_reader.next_record()? {
if stats.encoding.is_none() {
stats.encoding = fastq_reader.header().encoding;
}
stats.update(&record);
current_pos += record.len() as u64 * 2 + 40;
if current_pos >= chunk.end {
break;
}
}
Ok(stats)
})
.collect::<Result<Vec<_>>>()
})?;
Self::merge_all(partial_stats)
.ok_or_else(|| crate::error::Error::InvalidInput("No statistics computed".to_string()))
}
pub fn mean_length(&self) -> Option<f64> {
self.length_stats.mean()
}
pub fn std_length(&self) -> Option<f64> {
self.length_stats.std_dev()
}
pub fn min_length(&self) -> Option<f64> {
self.length_stats.min()
}
pub fn max_length(&self) -> Option<f64> {
self.length_stats.max()
}
pub fn mean_quality(&self) -> Option<f64> {
self.mean_quality_stats.mean()
}
pub fn std_quality(&self) -> Option<f64> {
self.mean_quality_stats.std_dev()
}
pub fn mean_gc_content(&self) -> Option<f64> {
self.gc_content_stats.mean()
}
pub fn std_gc_content(&self) -> Option<f64> {
self.gc_content_stats.std_dev()
}
pub fn percent_q20(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.high_quality_reads_q20 as f64 / self.total_reads as f64) * 100.0
}
}
pub fn percent_q30(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.high_quality_reads_q30 as f64 / self.total_reads as f64) * 100.0
}
}
pub fn percent_n(&self) -> f64 {
if self.total_bases == 0 {
0.0
} else {
(self.n_bases as f64 / self.total_bases as f64) * 100.0
}
}
pub fn base_composition(&self) -> BaseComposition {
let total = self.total_bases as f64;
if total == 0.0 {
return BaseComposition::default();
}
BaseComposition {
a_percent: (self.base_counts.get(&'A') as f64 / total) * 100.0,
c_percent: (self.base_counts.get(&'C') as f64 / total) * 100.0,
g_percent: (self.base_counts.get(&'G') as f64 / total) * 100.0,
t_percent: (self.base_counts.get(&'T') as f64 / total) * 100.0,
n_percent: self.percent_n(),
}
}
pub fn print_summary(&self) {
println!("=== FASTQ Statistics ===\n");
println!("Basic Counts:");
println!(" Total reads: {:>12}", self.total_reads);
println!(" Total bases: {:>12}", self.total_bases);
println!();
println!("Read Length:");
println!(" Mean: {:>12.2}", self.mean_length().unwrap_or(0.0));
println!(" Std Dev: {:>12.2}", self.std_length().unwrap_or(0.0));
println!(
" Min: {:>12.0}",
self.min_length().unwrap_or(0.0)
);
println!(
" Max: {:>12.0}",
self.max_length().unwrap_or(0.0)
);
println!();
println!("Quality Scores:");
if let Some(encoding) = self.encoding {
println!(" Encoding: {:>12?}", encoding);
}
println!(
" Mean quality: {:>12.2}",
self.mean_quality().unwrap_or(0.0)
);
println!(
" Std Dev: {:>12.2}",
self.std_quality().unwrap_or(0.0)
);
println!(" Q >= 20: {:>11.1}%", self.percent_q20());
println!(" Q >= 30: {:>11.1}%", self.percent_q30());
println!();
println!("Base Composition:");
let composition = self.base_composition();
println!(" A: {:>11.1}%", composition.a_percent);
println!(" C: {:>11.1}%", composition.c_percent);
println!(" G: {:>11.1}%", composition.g_percent);
println!(" T: {:>11.1}%", composition.t_percent);
println!(" N (ambiguous): {:>11.1}%", composition.n_percent);
println!();
println!("GC Content:");
println!(
" Mean: {:>11.1}%",
self.mean_gc_content().unwrap_or(0.0) * 100.0
);
println!(
" Std Dev: {:>11.1}%",
self.std_gc_content().unwrap_or(0.0) * 100.0
);
println!();
println!("Top Read Lengths:");
let mut length_vec: Vec<_> = self.length_distribution.iter().collect();
length_vec.sort_by(|a, b| b.1.cmp(a.1)); for (length, count) in length_vec.iter().take(10) {
let percent = (**count as f64 / self.total_reads as f64) * 100.0;
println!(" {:>4}bp: {:>10} ({:>5.1}%)", length, count, percent);
}
println!();
println!("Quality Distribution (binned):");
let mut qual_vec: Vec<_> = self.quality_distribution.iter().collect();
qual_vec.sort_by_key(|a| a.0); for (qual_bin, count) in qual_vec.iter().take(10) {
let percent = (**count as f64 / self.total_reads as f64) * 100.0;
println!(" Q{:>2}: {:>10} ({:>5.1}%)", qual_bin, count, percent);
}
}
}
#[derive(Debug, Clone, Default)]
pub struct BaseComposition {
pub a_percent: f64,
pub c_percent: f64,
pub g_percent: f64,
pub t_percent: f64,
pub n_percent: f64,
}
impl Mergeable for FastqStats {
fn merge(&mut self, other: Self) {
self.total_reads += other.total_reads;
self.total_bases += other.total_bases;
self.high_quality_reads_q20 += other.high_quality_reads_q20;
self.high_quality_reads_q30 += other.high_quality_reads_q30;
self.n_bases += other.n_bases;
self.length_stats.merge(other.length_stats);
self.mean_quality_stats.merge(other.mean_quality_stats);
self.gc_content_stats.merge(other.gc_content_stats);
self.length_distribution.merge(other.length_distribution);
self.quality_distribution.merge(other.quality_distribution);
self.base_counts.merge(other.base_counts);
if self.encoding.is_none() {
self.encoding = other.encoding;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn create_test_fastq() -> NamedTempFile {
let mut temp_file = NamedTempFile::new().unwrap();
writeln!(temp_file, "@READ1").unwrap();
writeln!(temp_file, "ACGTACGT").unwrap(); writeln!(temp_file, "+").unwrap();
writeln!(temp_file, "IIIIIIII").unwrap();
writeln!(temp_file, "@READ2").unwrap();
writeln!(temp_file, "AAAATTTT").unwrap(); writeln!(temp_file, "+").unwrap();
writeln!(temp_file, "########").unwrap();
writeln!(temp_file, "@READ3").unwrap();
writeln!(temp_file, "GGGGCCCCNNNN").unwrap(); writeln!(temp_file, "+").unwrap();
writeln!(temp_file, "IIIIIIIIIIII").unwrap();
temp_file.flush().unwrap();
temp_file
}
#[test]
fn test_fastq_stats_basic() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
assert_eq!(stats.total_reads, 3);
assert_eq!(stats.total_bases, 28);
Ok(())
}
#[test]
fn test_fastq_stats_length() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
let mean = stats.mean_length().unwrap();
assert!((mean - 9.33).abs() < 0.1);
assert_eq!(stats.min_length().unwrap(), 8.0);
assert_eq!(stats.max_length().unwrap(), 12.0);
Ok(())
}
#[test]
fn test_fastq_stats_quality() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
assert!(stats.high_quality_reads_q30 >= 2);
assert!(stats.percent_q30() >= 60.0);
Ok(())
}
#[test]
fn test_fastq_stats_gc_content() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
let mean_gc = stats.mean_gc_content().unwrap();
assert!((mean_gc - 0.389).abs() < 0.01);
Ok(())
}
#[test]
fn test_fastq_stats_base_composition() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
let composition = stats.base_composition();
assert!((composition.a_percent - 21.4).abs() < 1.0); assert!((composition.c_percent - 21.4).abs() < 1.0); assert!((composition.g_percent - 21.4).abs() < 1.0); assert!((composition.t_percent - 21.4).abs() < 1.0); assert!((composition.n_percent - 14.3).abs() < 1.0);
Ok(())
}
#[test]
fn test_fastq_stats_n_bases() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let stats = FastqStats::compute(&mut reader)?;
assert_eq!(stats.n_bases, 4); assert!((stats.percent_n() - 14.3).abs() < 1.0);
Ok(())
}
#[test]
fn test_fastq_stats_streaming() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let mut stats = FastqStats::new();
let mut count = 0;
while let Some(record) = reader.next_record()? {
stats.update(&record);
count += 1;
}
assert_eq!(count, 3);
assert_eq!(stats.total_reads, 3);
Ok(())
}
#[test]
fn test_fastq_stats_merge() -> Result<()> {
let temp_file = create_test_fastq();
let mut reader = FastqReader::from_path(temp_file.path())?;
let mut stats1 = FastqStats::new();
let mut stats2 = FastqStats::new();
if let Some(record) = reader.next_record()? {
stats1.update(&record);
}
while let Some(record) = reader.next_record()? {
stats2.update(&record);
}
assert_eq!(stats1.total_reads, 1);
assert_eq!(stats2.total_reads, 2);
stats1.merge(stats2);
assert_eq!(stats1.total_reads, 3);
assert_eq!(stats1.total_bases, 28);
Ok(())
}
}