rustynetics 0.1.4

A high-performance genomics libary specialized in handling BAM and BigWig files
Documentation
use std::process;
use std::thread;

use clap::{Arg, ArgAction, Command};

use rustynetics::genome::Genome;
use rustynetics::stringset::StringSet;
use rustynetics::tf::{TFMatrix, PWM};
use rustynetics::track::MutableTrack;
use rustynetics::track_generic::GenericTrack;
use rustynetics::track_simple::SimpleTrack;

mod common;

fn genome_from_stringset(sequences: &StringSet) -> Genome {
    let mut seqnames: Vec<_> = sequences.keys().cloned().collect();
    seqnames.sort();
    let lengths = seqnames
        .iter()
        .map(|name| sequences[name.as_str()].len())
        .collect();
    Genome::new(seqnames, lengths)
}

fn summarize(values: &[f64], summary_name: &str) -> f64 {
    if values.is_empty() {
        return f64::NAN;
    }
    match summary_name {
        "mean" => values.iter().sum::<f64>() / values.len() as f64,
        "max" => values.iter().copied().fold(f64::NEG_INFINITY, f64::max),
        "min" => values.iter().copied().fold(f64::INFINITY, f64::min),
        "discrete mean" => (values.iter().sum::<f64>() / values.len() as f64).floor() + 0.5,
        _ => unreachable!(),
    }
}

fn scan_sequence_bins(pwm: &PWM, sequence: &[u8], summary_name: &str, bin_size: usize) -> Vec<f64> {
    let motif_len = pwm.matrix.length();
    if motif_len == 0 || sequence.len() < motif_len {
        return Vec::new();
    }

    let n_bins = sequence.len().div_ceil(bin_size);
    let last_start = sequence.len() - motif_len + 1;
    let mut result = Vec::with_capacity(n_bins);

    for i in 0..n_bins {
        let start = i * bin_size;
        let end = ((i + 1) * bin_size).min(last_start);
        if start >= end {
            break;
        }

        let mut scores = Vec::with_capacity((end - start) * 2);
        for j in start..end {
            scores.push(
                pwm.matrix
                    .score(&sequence[j..j + motif_len], false, 0.0, |a, b| a + b)
                    .unwrap(),
            );
            scores.push(
                pwm.matrix
                    .score(&sequence[j..j + motif_len], true, 0.0, |a, b| a + b)
                    .unwrap(),
            );
        }
        result.push(summarize(&scores, summary_name));
    }

    result
}

fn main() {
    let matches = Command::new("pwm-scan-sequences")
        .about("Scan FASTA sequences with a PWM and export a BigWig track")
        .arg(
            Arg::new("bin-summary")
                .long("bin-summary")
                .default_value("max")
                .value_parser(["mean", "max", "min", "discrete mean"]),
        )
        .arg(Arg::new("bin-size").long("bin-size").default_value("10"))
        .arg(Arg::new("threads").long("threads").default_value("1"))
        .arg(
            Arg::new("verbose")
                .short('v')
                .long("verbose")
                .action(ArgAction::Count),
        )
        .arg(Arg::new("pwm").required(true).index(1))
        .arg(Arg::new("fasta").required(true).index(2))
        .arg(Arg::new("output").required(true).index(3))
        .get_matches();

    let pwm_path = matches.get_one::<String>("pwm").unwrap();
    let fasta_path = matches.get_one::<String>("fasta").unwrap();
    let output_path = matches.get_one::<String>("output").unwrap();
    let summary_name = matches.get_one::<String>("bin-summary").unwrap();
    let bin_size: usize = matches
        .get_one::<String>("bin-size")
        .unwrap()
        .parse()
        .unwrap_or_else(|error| {
            eprintln!("invalid bin size: {error}");
            process::exit(1);
        });
    let threads: usize = matches
        .get_one::<String>("threads")
        .unwrap()
        .parse()
        .unwrap_or_else(|error| {
            eprintln!("invalid number of threads: {error}");
            process::exit(1);
        });
    let verbose = matches.get_count("verbose") > 0;
    if threads == 0 {
        eprintln!("invalid number of threads: must be at least 1");
        process::exit(1);
    }

    let mut matrix = TFMatrix::empty();
    if verbose {
        eprintln!("Reading PWM `{}`...", pwm_path);
    }
    if let Err(error) = matrix.import_matrix(pwm_path) {
        eprintln!("reading PWM failed: {error}");
        process::exit(1);
    }
    let pwm = PWM::new(matrix);

    let mut sequences = StringSet::empty();
    if verbose {
        eprintln!("Reading FASTA `{}`...", fasta_path);
    }
    if let Err(error) = sequences.import_fasta(fasta_path) {
        eprintln!("reading FASTA failed: {error}");
        process::exit(1);
    }

    let genome = genome_from_stringset(&sequences);
    let mut track = SimpleTrack::alloc(String::new(), genome.clone(), f64::NAN, bin_size);
    let ranges = common::worker_ranges(genome.seqnames.len(), threads);
    let results = if ranges.len() <= 1 {
        genome
            .seqnames
            .iter()
            .map(|seqname| {
                if verbose {
                    eprintln!("Scanning sequence `{seqname}`...");
                }
                (
                    seqname.clone(),
                    scan_sequence_bins(&pwm, &sequences[seqname.as_str()], summary_name, bin_size),
                )
            })
            .collect::<Vec<_>>()
    } else {
        thread::scope(|scope| {
            let mut handles = Vec::new();
            for (start, end) in ranges {
                let seqnames = &genome.seqnames[start..end];
                let sequences = &sequences;
                let pwm = pwm.clone();
                let summary_name = summary_name.to_string();
                handles.push(scope.spawn(move || {
                    seqnames
                        .iter()
                        .map(|seqname| {
                            (
                                seqname.clone(),
                                scan_sequence_bins(
                                    &pwm,
                                    &sequences[seqname.as_str()],
                                    &summary_name,
                                    bin_size,
                                ),
                            )
                        })
                        .collect::<Vec<_>>()
                }));
            }

            let mut merged = Vec::with_capacity(genome.seqnames.len());
            for handle in handles {
                merged.extend(handle.join().unwrap_or_else(|_| {
                    eprintln!("pwm-scan-sequences worker thread panicked");
                    process::exit(1);
                }));
            }
            merged
        })
    };

    for (seqname, values) in results {
        let mut target = track.get_sequence_mut(&seqname).unwrap_or_else(|error| {
            eprintln!("accessing track sequence failed: {error}");
            process::exit(1);
        });
        for (i, value) in values.into_iter().enumerate() {
            target.set_bin(i, value);
        }
    }

    if let Err(error) = GenericTrack::wrap(&track).export_bigwig(output_path, vec![]) {
        eprintln!("writing BigWig failed: {error}");
        process::exit(1);
    }
}