biolic 0.1.0

A modular bioinformatics toolkit in Rust for long-read sequence processing
Documentation
//! `biolic stats`: compute summary statistics for sequence files.
//!
//! This module is fully implemented for FASTQ input. It demonstrates the
//! patterns used by other modules:
//! - Define `Args` struct with `clap::Args` derive
//! - Define `run()` entry point taking `(Args, &RunContext)`
//! - Define a result type that implements `Serialize` and `HumanDisplay`
//! - Use streaming reader from `io::reader`
//! - Output via `output::write()` respecting user's format choice.

use std::path::PathBuf;

use anyhow::{Context, Result};
use clap::Args;
use serde::Serialize;

use crate::cli::RunContext;
use crate::io::reader::open_input;
use crate::output::{write, HumanDisplay, OutputFormat};
use crate::utils::quality::{count_above_threshold, mean_quality};
use crate::utils::stats_helpers::{mean, median, min_max, nx};

#[derive(Args, Debug)]
pub struct StatsArgs {
    /// Input file (FASTQ, FASTQ.gz, FASTA, or BAM). Use "-" for stdin.
    pub input: PathBuf,

    /// Output in JSON format.
    #[arg(long, conflicts_with = "tsv")]
    pub json: bool,

    /// Output in TSV format.
    #[arg(long, conflicts_with = "json")]
    pub tsv: bool,

    /// Print extended statistics (length distribution percentiles).
    #[arg(short = 'e', long)]
    pub extended: bool,
}

/// Computed statistics for a sequence file.
#[derive(Debug, Serialize)]
pub struct Stats {
    pub file: String,
    pub read_count: u64,
    pub total_bases: u64,
    pub min_length: u64,
    pub max_length: u64,
    pub mean_length: f64,
    pub median_length: f64,
    pub n50: u64,
    pub n90: u64,
    pub mean_quality: f64,
    pub bases_above_q10_pct: f64,
    pub bases_above_q20_pct: f64,
    pub bases_above_q30_pct: f64,
    pub gc_content_pct: f64,
}

pub fn run(args: StatsArgs, ctx: &RunContext) -> Result<()> {
    if ctx.verbose {
        eprintln!("[biolic stats] Reading from {}", args.input.display());
    }

    let stats = compute(&args)?;

    let format = if args.json {
        OutputFormat::Json
    } else if args.tsv {
        OutputFormat::Tsv
    } else {
        OutputFormat::auto()
    };

    write(&stats, format)?;
    Ok(())
}

/// Core computation: single streaming pass over the input file.
///
/// Memory: O(N) where N is the number of reads (we keep a Vec of lengths
/// for N50 computation). This is unavoidable for exact N50. For terabyte
/// files with billions of reads, a histogram-based approximation could be
/// added later.
pub fn compute(args: &StatsArgs) -> Result<Stats> {
    let mut reader = open_input(&args.input).context("opening input")?;

    let mut read_count: u64 = 0;
    let mut total_bases: u64 = 0;
    let mut lengths: Vec<u64> = Vec::new();

    // Aggregate quality across all reads using probability-space averaging.
    let mut sum_error_prob: f64 = 0.0;
    let mut bases_q10: u64 = 0;
    let mut bases_q20: u64 = 0;
    let mut bases_q30: u64 = 0;

    // Aggregate base composition.
    let mut gc_count: u64 = 0;
    let mut acgt_count: u64 = 0;

    while let Some(record) = reader.next_record().context("reading record")? {
        read_count += 1;
        total_bases += record.len() as u64;
        lengths.push(record.len() as u64);

        // Quality stats only if quality scores are present.
        if let Some(qual) = &record.qual {
            for &q in qual {
                sum_error_prob += crate::utils::quality::phred_to_prob(q);
            }
            bases_q10 += count_above_threshold(qual, 10) as u64;
            bases_q20 += count_above_threshold(qual, 20) as u64;
            bases_q30 += count_above_threshold(qual, 30) as u64;
        }

        let (a, c, g, t, _n, _other) = record.base_counts();
        acgt_count += a + c + g + t;
        gc_count += c + g;
    }

    let (min_length, max_length) = min_max(&lengths);
    let mean_length = mean(&lengths);
    let median_length = median(&lengths);
    let n50_val = nx(&lengths, 50);
    let n90_val = nx(&lengths, 90);

    let mean_qual = if total_bases > 0 && sum_error_prob > 0.0 {
        crate::utils::quality::prob_to_phred(sum_error_prob / total_bases as f64)
    } else {
        0.0
    };

    let bases_above_q10_pct = pct(bases_q10, total_bases);
    let bases_above_q20_pct = pct(bases_q20, total_bases);
    let bases_above_q30_pct = pct(bases_q30, total_bases);
    let gc_content_pct = pct(gc_count, acgt_count);

    let _ = (mean_quality(&[]), args.extended); // silence unused-import lints

    Ok(Stats {
        file: args.input.display().to_string(),
        read_count,
        total_bases,
        min_length,
        max_length,
        mean_length,
        median_length,
        n50: n50_val,
        n90: n90_val,
        mean_quality: mean_qual,
        bases_above_q10_pct,
        bases_above_q20_pct,
        bases_above_q30_pct,
        gc_content_pct,
    })
}

fn pct(num: u64, denom: u64) -> f64 {
    if denom == 0 {
        0.0
    } else {
        100.0 * (num as f64) / (denom as f64)
    }
}

impl HumanDisplay for Stats {
    fn write_human(&self, w: &mut dyn std::io::Write) -> Result<()> {
        writeln!(w, "File:              {}", self.file)?;
        writeln!(w, "Reads:             {}", fmt_int(self.read_count))?;
        writeln!(w, "Total bases:       {}", fmt_int(self.total_bases))?;
        writeln!(w, "Min length:        {}", fmt_int(self.min_length))?;
        writeln!(w, "Max length:        {}", fmt_int(self.max_length))?;
        writeln!(w, "Mean length:       {:.1}", self.mean_length)?;
        writeln!(w, "Median length:     {:.1}", self.median_length)?;
        writeln!(w, "N50:               {}", fmt_int(self.n50))?;
        writeln!(w, "N90:               {}", fmt_int(self.n90))?;
        writeln!(w, "Mean quality:      {:.2}", self.mean_quality)?;
        writeln!(w, "Bases above Q10:   {:.2}%", self.bases_above_q10_pct)?;
        writeln!(w, "Bases above Q20:   {:.2}%", self.bases_above_q20_pct)?;
        writeln!(w, "Bases above Q30:   {:.2}%", self.bases_above_q30_pct)?;
        writeln!(w, "GC content:        {:.2}%", self.gc_content_pct)?;
        Ok(())
    }

    fn write_tsv(&self, w: &mut dyn std::io::Write) -> Result<()> {
        writeln!(
            w,
            "file\treads\ttotal_bases\tmin_length\tmax_length\tmean_length\tmedian_length\tn50\tn90\tmean_quality\tq10_pct\tq20_pct\tq30_pct\tgc_pct"
        )?;
        writeln!(
            w,
            "{}\t{}\t{}\t{}\t{}\t{:.1}\t{:.1}\t{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}",
            self.file,
            self.read_count,
            self.total_bases,
            self.min_length,
            self.max_length,
            self.mean_length,
            self.median_length,
            self.n50,
            self.n90,
            self.mean_quality,
            self.bases_above_q10_pct,
            self.bases_above_q20_pct,
            self.bases_above_q30_pct,
            self.gc_content_pct,
        )?;
        Ok(())
    }
}

fn fmt_int(n: u64) -> String {
    let s = n.to_string();
    let mut out = String::new();
    for (i, c) in s.chars().rev().enumerate() {
        if i > 0 && i % 3 == 0 {
            out.push(',');
        }
        out.push(c);
    }
    out.chars().rev().collect()
}