fastqc-rust 1.0.1

A Rust rewrite of FastQC - a quality control tool for high throughput sequence data
Documentation
use std::path::PathBuf;
use std::process;

use clap::Parser;

use fastqc_rust::config::{FastQCConfig, TemplateName};
use fastqc_rust::runner;

// NOTE: the `///` comments in this struct are not just documentation: clap's
// derive macro turns them into the text printed by `--help`, so editing them
// changes user-visible output.
//
// Short flags written as a bare `short` are derived from the first letter of
// the field name; every short flag in the command must be unique.
/// FastQC - A high throughput sequence QC analysis tool
#[derive(Parser, Debug)]
#[command(name = "fastqc", version = fastqc_rust::VERSION_BANNER, about)]
struct Cli {
    /// Create report files in the specified output directory.
    /// The directory must already exist.
    #[arg(short, long, value_name = "DIR")]
    outdir: Option<PathBuf>,

    /// Force the file format. Valid formats: bam, sam, bam_mapped, sam_mapped, fastq.
    #[arg(short, long, value_name = "FORMAT")]
    format: Option<String>,

    /// Files were generated by an Illumina CASAVA pipeline version >= 1.8.
    /// Sequences flagged as filtered will be excluded from the analysis.
    #[arg(long)]
    casava: bool,

    /// Files come from nanopore sequences and are in fast5 format.
    /// In this mode you can pass in directories to process.
    #[arg(long)]
    nano: bool,

    /// If running with --casava, don't remove reads flagged by CASAVA as poor quality.
    #[arg(long)]
    nofilter: bool,

    /// Disable grouping of bases for reads > 50bp.
    /// All reports will show data for every base in the read.
    /// WARNING: Using this option with very long reads may cause excessive memory use.
    #[arg(long)]
    nogroup: bool,

    /// Use exponential base groups in report.
    #[arg(long)]
    expgroup: bool,

    /// Extract the zipped report file after creating it.
    /// By default files are not extracted after creation.
    #[arg(long)]
    extract: bool,

    /// Do not extract the zipped report file after creating it (default behavior).
    #[arg(long)]
    noextract: bool,

    /// Delete the zipped output file after it has been extracted.
    /// Only has an effect if --extract is also specified.
    #[arg(long)]
    delete: bool,

    /// Specifies a non-default file which contains the list of contaminants to
    /// screen overrepresented sequences against.
    #[arg(short = 'c', long, value_name = "FILE")]
    contaminants: Option<PathBuf>,

    /// Specifies a non-default file which contains the list of adapter sequences
    /// which will be explicitly searched against the library.
    #[arg(short, long, value_name = "FILE")]
    adapters: Option<PathBuf>,

    /// Specifies a non-default file which contains a set of criteria used to
    /// determine the warn/error limits for the various modules.
    #[arg(short, long, value_name = "FILE")]
    limits: Option<PathBuf>,

    /// Specifies the number of files which can be processed simultaneously.
    /// Each thread will be allocated 250MB of memory.
    // `-t` is spelled out here because it is the one short flag another
    // argument could plausibly want; it belongs to `--threads`.
    #[arg(short = 't', long, value_name = "N", default_value = "1")]
    threads: usize,

    /// Specifies the length of Kmer to look for in the Kmer content module.
    /// Specified Kmer length must be between 2 and 10. Default length is 7.
    // The 2..=10 range is enforced in `main`, not by clap.
    #[arg(short, long, value_name = "N", default_value = "7")]
    kmers: u8,

    /// Suppress all progress messages on stdout and only report errors.
    #[arg(short, long)]
    quiet: bool,

    /// Selects a directory to be used for temporary files written when
    /// generating report images. Defaults to system temp dir.
    #[arg(short, long, value_name = "DIR")]
    dir: Option<PathBuf>,

    /// Sets an artificial lower limit on the length of the sequence to be shown
    /// in the report. Sequences shorter than this limit will not be shown.
    // Java uses --min_length (underscore), clap defaults to --min-length (hyphen).
    // Allow both forms.
    #[arg(
        long = "min_length",
        alias = "min-length",
        value_name = "N",
        default_value = "0"
    )]
    min_length: usize,

    /// Specifies the truncation length used for duplicate detection.
    /// Reads longer than this value will be truncated before checking for duplicates.
    // Java uses --dup_length (underscore); the hyphenated form is accepted as an alias.
    #[arg(
        long = "dup_length",
        alias = "dup-length",
        value_name = "N",
        default_value = "0"
    )]
    dup_length: usize,

    /// Save images as SVG files as well as PNG.
    #[arg(long)]
    svg: bool,

    /// Select the HTML report template.
    /// "classic" produces the original FastQC report layout.
    /// "modern" uses a redesigned layout with responsive sidebar and help text.
    // Long-only on purpose: `-t` is already taken by `--threads`, and clap
    // requires short flags to be unique (a duplicate fails its debug
    // assertions at parse time).
    #[arg(long, value_name = "NAME", default_value = "classic")]
    template: TemplateName,

    /// Input files (one or more FastQ, BAM, or SAM files).
    #[arg(required = true)]
    files: Vec<PathBuf>,
}

fn main() {
    let cli = Cli::parse();

    // Validate kmer range
    if cli.kmers < 2 || cli.kmers > 10 {
        eprintln!(
            "Error: kmer size must be between 2 and 10, got {}",
            cli.kmers
        );
        process::exit(1);
    }

    // Validate format if provided
    if let Some(ref fmt) = cli.format {
        match fmt.as_str() {
            "bam" | "sam" | "bam_mapped" | "sam_mapped" | "fastq" => {}
            _ => {
                eprintln!(
                    "Error: unrecognized format '{}'. \
                     Valid formats: bam, sam, bam_mapped, sam_mapped, fastq",
                    fmt
                );
                process::exit(1);
            }
        }
    }

    // Validate output directory exists if specified
    if let Some(ref dir) = cli.outdir {
        if !dir.is_dir() {
            eprintln!(
                "Error: output directory '{}' does not exist or is not a directory",
                dir.display()
            );
            process::exit(1);
        }
    }

    // Build config
    let do_unzip = if cli.extract {
        Some(true)
    } else if cli.noextract {
        Some(false)
    } else {
        None
    };

    let config = FastQCConfig {
        nogroup: cli.nogroup,
        expgroup: cli.expgroup,
        quiet: cli.quiet,
        kmer_size: cli.kmers,
        threads: cli.threads,
        output_dir: cli.outdir,
        casava: cli.casava,
        nano: cli.nano,
        nofilter: cli.nofilter,
        do_unzip,
        delete_after_unzip: cli.delete,
        sequence_format: cli.format,
        contaminant_file: cli.contaminants,
        adapter_file: cli.adapters,
        limits_file: cli.limits,
        min_length: cli.min_length,
        dup_length: cli.dup_length,
        svg_output: cli.svg,
        temp_dir: cli.dir,
        template: cli.template,
    };

    if let Err(exit_code) = runner::run(&config, &cli.files) {
        process::exit(exit_code);
    }
}