use anyhow::Result;
use clap::Args;
use crate::core::tables::LOOKUP_TABLES;
use crate::core::{SeqReader, SeqRecord};
#[derive(Args, Debug)]
pub struct FqchkArgs {
#[arg(value_name = "in.fq")]
pub input: String,
#[arg(short = 'q', default_value = "20")]
pub quality_threshold: usize,
}
#[derive(Debug, Clone)]
struct PosStats {
quality: [u64; 94],
bases: [u64; 5],
}
impl PosStats {
fn new() -> Self {
Self {
quality: [0; 94],
bases: [0; 5],
}
}
fn add(&mut self, base: u8, qual: u8, offset: u8) {
let q = (qual.saturating_sub(offset) as usize).min(93);
self.quality[q] += 1;
let nt6 = LOOKUP_TABLES.nt6[base as usize];
let base_idx = if (1..=4).contains(&nt6) {
(nt6 - 1) as usize
} else {
4 };
self.bases[base_idx] += 1;
}
fn merge(&mut self, other: &PosStats) {
for i in 0..94 {
self.quality[i] += other.quality[i];
}
for i in 0..5 {
self.bases[i] += other.bases[i];
}
}
fn total_bases(&self) -> u64 {
self.bases.iter().sum()
}
fn avg_quality(&self) -> f64 {
let total: u64 = self.quality.iter().sum();
if total == 0 {
return 0.0;
}
let sum: u64 = self
.quality
.iter()
.enumerate()
.map(|(q, &count)| q as u64 * count)
.sum();
sum as f64 / total as f64
}
fn error_quality(&self, perr: &[f64; 94]) -> f64 {
let total: u64 = self.quality.iter().sum();
if total == 0 {
return 0.0;
}
let psum: f64 = self
.quality
.iter()
.enumerate()
.map(|(q, &count)| count as f64 * perr[q])
.sum();
-4.343 * ((psum + 1e-6) / (total as f64 + 1e-6)).ln()
}
}
pub fn run(args: &FqchkArgs) -> Result<()> {
let mut reader = if args.input == "-" {
SeqReader::from_stdin()
} else {
SeqReader::from_path(&args.input)?
};
let mut n_reads = 0u64;
let mut tot_len = 0u64;
let mut min_len = usize::MAX;
let mut max_len = 0usize;
let mut pos_stats: Vec<PosStats> = Vec::new();
let mut perr = [0.0f64; 94];
for (k, item) in perr.iter_mut().enumerate() {
*item = 10.0f64.powf(-0.1 * k as f64);
}
perr[0] = 0.5;
perr[1] = 0.5;
perr[2] = 0.5;
perr[3] = 0.5;
let offset = 33u8;
let mut record = SeqRecord::new(Vec::new(), Vec::new());
while reader.read_next(&mut record)? {
if !record.is_fastq() {
continue;
}
n_reads += 1;
let seq_len = record.seq.len();
tot_len += seq_len as u64;
min_len = min_len.min(seq_len);
max_len = max_len.max(seq_len);
while pos_stats.len() < seq_len {
pos_stats.push(PosStats::new());
}
if let Some(ref qual) = record.qual {
for i in 0..seq_len {
pos_stats[i].add(record.seq[i], qual[i], offset);
}
}
}
if n_reads == 0 {
eprintln!("警告:未找到任何 FASTQ 序列");
return Ok(());
}
let mut all_stats = PosStats::new();
for pos in &pos_stats {
all_stats.merge(pos);
}
let n_diff_q = all_stats.quality.iter().filter(|&&q| q > 0).count();
println!(
"min_len: {}; max_len: {}; avg_len: {:.2}; {} distinct quality values",
min_len,
max_len,
tot_len as f64 / n_reads as f64,
n_diff_q
);
print!("POS\t#bases\t%A\t%C\t%G\t%T\t%N\tavgQ\terrQ");
if args.quality_threshold == 0 {
for k in 0..=93 {
if all_stats.quality[k] > 0 {
print!("\t%Q{}", k);
}
}
} else {
print!("\t%low\t%high");
}
println!();
print_pos_stats(&all_stats, 0, &perr, args.quality_threshold, &all_stats);
for (i, pos) in pos_stats.iter().enumerate() {
print_pos_stats(pos, i + 1, &perr, args.quality_threshold, &all_stats);
}
Ok(())
}
fn print_pos_stats(
stats: &PosStats,
pos: usize,
perr: &[f64; 94],
qthres: usize,
all_stats: &PosStats,
) {
let total = stats.total_bases();
if total == 0 {
return;
}
if pos == 0 {
print!("ALL");
} else {
print!("{}", pos);
}
print!("\t{}", total);
for i in 0..5 {
let pct = 100.0 * stats.bases[i] as f64 / total as f64;
print!("\t{:.1}", pct);
}
print!("\t{:.1}", stats.avg_quality());
print!("\t{:.1}", stats.error_quality(perr));
if qthres == 0 {
for k in 0..=93 {
if all_stats.quality[k] > 0 {
let pct = 100.0 * stats.quality[k] as f64 / total as f64;
print!("\t{:.2}", pct);
}
}
} else {
let sum_low: u64 = stats.quality[..qthres].iter().sum();
let sum_high = total - sum_low;
let pct_low = 100.0 * sum_low as f64 / total as f64;
let pct_high = 100.0 * sum_high as f64 / total as f64;
print!("\t{:.1}\t{:.1}", pct_low, pct_high);
}
println!();
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pos_stats_add() {
let mut stats = PosStats::new();
stats.add(b'A', b'I', 33);
assert_eq!(stats.bases[0], 1); assert_eq!(stats.quality[40], 1); }
#[test]
fn test_pos_stats_merge() {
let mut stats1 = PosStats::new();
stats1.add(b'A', b'I', 33);
let mut stats2 = PosStats::new();
stats2.add(b'C', b'I', 33);
stats1.merge(&stats2);
assert_eq!(stats1.bases[0], 1); assert_eq!(stats1.bases[1], 1); assert_eq!(stats1.quality[40], 2); }
#[test]
fn test_pos_stats_avg_quality() {
let mut stats = PosStats::new();
stats.quality[30] = 1;
stats.quality[40] = 1;
let avg = stats.avg_quality();
assert!((avg - 35.0).abs() < 0.01);
}
#[test]
fn test_pos_stats_total_bases() {
let mut stats = PosStats::new();
stats.bases[0] = 10; stats.bases[1] = 20; stats.bases[2] = 30; stats.bases[3] = 40;
assert_eq!(stats.total_bases(), 100);
}
}