rustqc 0.1.0

Fast RNA-seq QC in a single pass: dupRadar, featureCounts, 8 RSeQC tools, preseq, samtools stats, and Qualimap — reimplemented in Rust
//! featureCounts-compatible output file generation.
//!
//! Produces the same output files as Subread's featureCounts:
//! - Main counts file (`.featureCounts.tsv`): per-gene read counts with annotation
//! - Summary file (`.featureCounts.tsv.summary`): assignment statistics
//! - Biotype counts file: counts aggregated by a GTF attribute (e.g. `gene_biotype`)
//! - MultiQC biotype bargraph file: biotype counts formatted for MultiQC

use crate::gtf::Gene;
use crate::rna::dupradar::counting::CountResult;
use anyhow::Result;
use indexmap::IndexMap;
use std::io::Write;
use std::path::Path;

/// Write the featureCounts-compatible main counts file.
///
/// Produces a tab-separated file matching featureCounts output format:
/// - Line 1: comment with program version and command
/// - Line 2: header with column names
/// - Lines 3+: one row per gene with annotation columns and count
///
/// # Columns
/// `Geneid`, `Chr`, `Start`, `End`, `Strand`, `Length`, `<bam_filename>`
pub fn write_counts_file(
    path: &Path,
    genes: &IndexMap<String, Gene>,
    counts: &CountResult,
    bam_path: &str,
    command_line: &str,
) -> Result<()> {
    let file = std::fs::File::create(path)?;
    let mut w = std::io::BufWriter::new(file);

    // Line 1: comment header with program info (matches featureCounts format)
    writeln!(
        w,
        "# Program:featureCounts v2.0.6, generated by RustQC v{}; Command:\"{}\"",
        env!("CARGO_PKG_VERSION"),
        command_line
    )?;

    // Line 2: column header
    let bam_name = Path::new(bam_path)
        .file_name()
        .map(|n| n.to_string_lossy().to_string())
        .unwrap_or_else(|| bam_path.to_string());
    writeln!(w, "Geneid\tChr\tStart\tEnd\tStrand\tLength\t{}", bam_name)?;

    // Data rows
    for (gene_id, gene) in genes.iter() {
        // Skip genes with no exons
        if gene.exons.is_empty() {
            continue;
        }

        // Build semicolon-separated annotation fields from exons
        let chrs: Vec<&str> = gene.exons.iter().map(|e| e.chrom.as_str()).collect();
        let starts: Vec<String> = gene.exons.iter().map(|e| e.start.to_string()).collect();
        let ends: Vec<String> = gene.exons.iter().map(|e| e.end.to_string()).collect();
        let strands: Vec<String> = gene.exons.iter().map(|e| e.strand.to_string()).collect();

        // Use per-read counts for featureCounts output (matches featureCounts -p behavior
        // where each read is counted independently, not as fragments)
        let count = counts
            .gene_counts
            .get(gene_id)
            .map(|gc| gc.fc_reads)
            .unwrap_or(0);

        writeln!(
            w,
            "{}\t{}\t{}\t{}\t{}\t{}\t{}",
            gene_id,
            chrs.join(";"),
            starts.join(";"),
            ends.join(";"),
            strands.join(";"),
            gene.effective_length,
            count,
        )?;
    }

    Ok(())
}

/// Write the featureCounts-compatible summary file.
///
/// Produces a tab-separated file with read assignment statistics.
/// The status categories match featureCounts output.
///
/// When `biotype_in_gtf` is true, uses biotype-level assignment stats
/// (matching `featureCounts -g gene_biotype -B -C` as nf-core/rnaseq runs).
/// Reads overlapping multiple genes of the same biotype are Assigned at the
/// biotype level, not Ambiguous. When false, uses gene-level stats.
pub fn write_summary_file(
    path: &Path,
    counts: &CountResult,
    bam_path: &str,
    biotype_in_gtf: bool,
) -> Result<()> {
    let file = std::fs::File::create(path)?;
    let mut w = std::io::BufWriter::new(file);

    let bam_name = Path::new(bam_path)
        .file_name()
        .map(|n| n.to_string_lossy().to_string())
        .unwrap_or_else(|| bam_path.to_string());

    // When biotype counting is active, use biotype-level Assigned/Ambiguous/
    // NoFeatures to match the nf-core pipeline's `featureCounts -g gene_biotype`
    // summary. Otherwise fall back to gene-level stats.
    let (assigned, ambiguous, no_features) = if biotype_in_gtf {
        (
            counts.fc_biotype_assigned,
            counts.fc_biotype_ambiguous,
            counts.fc_biotype_no_features,
        )
    } else {
        (
            counts.fc_assigned,
            counts.fc_ambiguous,
            counts.fc_no_features,
        )
    };

    writeln!(w, "Status\t{}", bam_name)?;
    writeln!(w, "Assigned\t{}", assigned)?;
    writeln!(w, "Unassigned_Unmapped\t{}", counts.fc_unmapped)?;
    writeln!(w, "Unassigned_Read_Type\t0")?;
    writeln!(w, "Unassigned_Singleton\t{}", counts.fc_singleton)?;
    writeln!(w, "Unassigned_MappingQuality\t0")?;
    writeln!(w, "Unassigned_Chimera\t{}", counts.fc_chimera)?;
    writeln!(w, "Unassigned_FragmentLength\t0")?;
    writeln!(w, "Unassigned_Duplicate\t0")?;
    writeln!(w, "Unassigned_MultiMapping\t{}", counts.fc_multimapping)?;
    writeln!(w, "Unassigned_Secondary\t0")?;
    writeln!(w, "Unassigned_NonSplit\t0")?;
    writeln!(w, "Unassigned_NoFeatures\t{}", no_features)?;
    writeln!(w, "Unassigned_Overlapping_Length\t0")?;
    writeln!(w, "Unassigned_Ambiguity\t{}", ambiguous)?;

    Ok(())
}

/// Aggregate counts by biotype, matching `featureCounts -g gene_biotype` behaviour.
///
/// Uses pre-computed biotype-level read counts from the counting engine, where
/// exons are grouped by biotype into mega-features (reads overlapping multiple
/// genes of the same biotype are Assigned, not Ambiguous).
///
/// Returns an ordered map of biotype_name -> total count, sorted descending.
pub fn aggregate_biotype_counts(counts: &CountResult) -> IndexMap<String, u64> {
    let mut biotype_counts: IndexMap<String, u64> = IndexMap::new();

    for (idx, name) in counts.biotype_names.iter().enumerate() {
        let count = counts.biotype_reads.get(idx).copied().unwrap_or(0);
        biotype_counts.insert(name.clone(), count);
    }

    // Sort by count descending, then alphabetically for stable ordering of ties
    let mut pairs: Vec<(String, u64)> = biotype_counts.into_iter().collect();
    pairs.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    pairs.into_iter().collect()
}

/// Write biotype counts to a plain TSV file.
///
/// Two columns: biotype name and count, tab-separated.
pub fn write_biotype_counts(path: &Path, biotype_counts: &IndexMap<String, u64>) -> Result<()> {
    let file = std::fs::File::create(path)?;
    let mut w = std::io::BufWriter::new(file);

    writeln!(w, "Biotype\tCount")?;
    for (biotype, count) in biotype_counts.iter() {
        writeln!(w, "{}\t{}", biotype, count)?;
    }

    Ok(())
}

/// Write biotype counts formatted for MultiQC custom content (bargraph).
///
/// This matches the format used by the nf-core/rnaseq pipeline's
/// `MULTIQC_CUSTOM_BIOTYPE` process.
pub fn write_biotype_counts_mqc(path: &Path, biotype_counts: &IndexMap<String, u64>) -> Result<()> {
    let file = std::fs::File::create(path)?;
    let mut w = std::io::BufWriter::new(file);

    // MultiQC YAML header
    writeln!(w, "# id: 'biotype_counts'")?;
    writeln!(w, "# section_name: 'Biotype Counts'")?;
    writeln!(
        w,
        "# description: \"shows reads overlapping genomic features of different biotypes,"
    )?;
    writeln!(
        w,
        "#     counted by RustQC based on the <a href='http://bioinf.wehi.edu.au/featureCounts'>featureCounts</a> implementation.\""
    )?;
    writeln!(w, "# plot_type: 'bargraph'")?;
    writeln!(w, "# anchor: 'featurecounts_biotype'")?;
    writeln!(w, "# pconfig:")?;
    writeln!(w, "#     id: \"featurecounts_biotype_plot\"")?;
    writeln!(w, "#     title: \"featureCounts: Biotypes\"")?;
    writeln!(w, "#     xlab: \"# Reads\"")?;
    writeln!(w, "#     cpswitch_counts_label: \"Number of Reads\"")?;

    // Data: biotype\tcount (no header row — MultiQC custom content format)
    for (biotype, count) in biotype_counts.iter() {
        writeln!(w, "{}\t{}", biotype, count)?;
    }

    Ok(())
}

/// Write biotype rRNA percentage for MultiQC general stats.
///
/// Extracts the rRNA count and computes the percentage of total assigned reads.
pub fn write_biotype_rrna_mqc(
    path: &Path,
    biotype_counts: &IndexMap<String, u64>,
    total_assigned: u64,
    sample_name: &str,
) -> Result<()> {
    let file = std::fs::File::create(path)?;
    let mut w = std::io::BufWriter::new(file);

    let rrna_count = biotype_counts.get("rRNA").copied().unwrap_or(0);
    let rrna_pct = if total_assigned > 0 {
        rrna_count as f64 / total_assigned as f64 * 100.0
    } else {
        0.0
    };

    // MultiQC YAML header for general stats
    writeln!(w, "#id: 'biotype-gs'")?;
    writeln!(w, "#plot_type: 'generalstats'")?;
    writeln!(w, "#pconfig:")?;
    writeln!(w, "#    percent_rRNA:")?;
    writeln!(w, "#        title: '% rRNA'")?;
    writeln!(w, "#        namespace: 'Biotype Counts'")?;
    writeln!(
        w,
        "#        description: '% reads overlapping rRNA features'"
    )?;
    writeln!(w, "#        max: 100")?;
    writeln!(w, "#        min: 0")?;
    writeln!(w, "#        scale: 'RdYlGn-rev'")?;
    writeln!(w, "Sample\tpercent_rRNA")?;
    writeln!(w, "'{}'\t{}", sample_name, rrna_pct)?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::rna::dupradar::counting::GeneCounts;
    use std::collections::HashMap;

    fn make_test_gene(gene_id: &str, biotype: Option<&str>) -> Gene {
        let mut attributes = HashMap::new();
        if let Some(bt) = biotype {
            attributes.insert("gene_biotype".to_string(), bt.to_string());
        }
        Gene {
            gene_id: gene_id.to_string(),
            chrom: "chr1".to_string(),
            start: 100,
            end: 200,
            strand: '+',
            exons: vec![crate::gtf::Exon {
                chrom: "chr1".to_string(),
                start: 100,
                end: 200,
                strand: '+',
            }],
            effective_length: 101,
            attributes,
            transcripts: Vec::new(),
        }
    }

    #[test]
    fn test_aggregate_biotype_counts() {
        let mut genes = IndexMap::new();
        genes.insert(
            "gene1".to_string(),
            make_test_gene("gene1", Some("protein_coding")),
        );
        genes.insert(
            "gene2".to_string(),
            make_test_gene("gene2", Some("protein_coding")),
        );
        genes.insert("gene3".to_string(), make_test_gene("gene3", Some("rRNA")));
        genes.insert("gene4".to_string(), make_test_gene("gene4", None));

        let mut gene_counts = IndexMap::new();
        gene_counts.insert(
            "gene1".to_string(),
            GeneCounts {
                all_unique: 100,
                fc_reads: 100,
                ..Default::default()
            },
        );
        gene_counts.insert(
            "gene2".to_string(),
            GeneCounts {
                all_unique: 50,
                fc_reads: 50,
                ..Default::default()
            },
        );
        gene_counts.insert(
            "gene3".to_string(),
            GeneCounts {
                all_unique: 25,
                fc_reads: 25,
                ..Default::default()
            },
        );
        gene_counts.insert(
            "gene4".to_string(),
            GeneCounts {
                all_unique: 10,
                fc_reads: 10,
                ..Default::default()
            },
        );

        // Pre-computed biotype-level counts matching featureCounts -g gene_biotype
        // behaviour: biotype_names[0] = "protein_coding" (150 reads),
        // biotype_names[1] = "rRNA" (25 reads). gene4 has no biotype so is not counted.
        let count_result = CountResult {
            gene_counts,
            stat_total_reads: 200,
            stat_assigned: 185,
            stat_ambiguous: 5,
            stat_no_features: 10,
            stat_total_fragments: 200,
            stat_singleton_unmapped_mates: 0,
            stat_total_mapped: 200,
            stat_total_dup: 0,
            fc_assigned: 185,
            fc_ambiguous: 5,
            fc_no_features: 10,
            fc_multimapping: 0,
            fc_unmapped: 0,
            fc_singleton: 0,
            fc_chimera: 0,
            biotype_reads: vec![150, 25],
            biotype_names: vec!["protein_coding".to_string(), "rRNA".to_string()],
            fc_biotype_assigned: 175,
            fc_biotype_ambiguous: 0,
            fc_biotype_no_features: 10,
            rseqc: None,
            qualimap: None,
        };

        let biotypes = aggregate_biotype_counts(&count_result);

        assert_eq!(biotypes.get("protein_coding"), Some(&150));
        assert_eq!(biotypes.get("rRNA"), Some(&25));
        // gene4 has no biotype attribute, so it's not in biotype_reads
        assert_eq!(biotypes.get("unknown"), None);
    }
}