prepare_fasta 0.2.0

Compute hash-based signatures of sequence, and perform pre-processing
Documentation
use anyhow::Context;
use flate2::write::GzEncoder;
use flate2::Compression;
use needletail::parse_fastx_file;
use path_tools::WithAdditionalExtension;
use serde_json::json;
use sha2::{Digest, Sha256, Sha512};
use std::ffi::OsStr;
use std::fs::OpenOptions;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

struct PrepStats {
    num_trimmed: u64,
    num_records: u64,
}

impl PrepStats {
    fn new() -> Self {
        Self {
            num_trimmed: 0,
            num_records: 0,
        }
    }
}

struct Signatures {
    name_hash_256: Sha256,
    seq_hash_256: Sha256,
    name_hash_512: Sha512,
    seq_hash_512: Sha512,
}

impl Signatures {
    fn new() -> Self {
        Self {
            name_hash_256: Sha256::new(),
            seq_hash_256: Sha256::new(),
            name_hash_512: Sha512::new(),
            seq_hash_512: Sha512::new(),
        }
    }

    fn add_name(&mut self, n: &[u8]) {
        self.name_hash_256.update(n);
        self.name_hash_512.update(n);
    }

    fn add_seq(&mut self, s: &[u8]) {
        self.seq_hash_256.update(s);
        self.seq_hash_512.update(s);
    }
}

/// Holds information relevant to the parsing, processing and output
pub struct RecordParseConfig<T: AsRef<Path>> {
    pub input: Vec<T>,
    pub output_stem: PathBuf,
    pub polya_clip_length: Option<usize>,
}

/// Stream through the input sequences, clipping the polyA tails of the specified
/// length and computing the statistics and signatures.
fn stream_with_clip<T: AsRef<Path>>(
    filenames: &[T],
    poly_a: &[u8],
    output_filename: PathBuf,
    mut stats: PrepStats,
    mut sigs: Signatures,
) -> anyhow::Result<Vec<PathBuf>> {
    let seq_filename = output_filename.with_extension("fa.gz");
    let seq_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(seq_filename.clone())?;

    let seq_file = GzEncoder::new(seq_file, Compression::default());
    let mut seq_file = BufWriter::new(seq_file);

    for filename in filenames.iter() {
        let mut reader = parse_fastx_file(filename).context("invalid input path/file")?;
        while let Some(record) = reader.next() {
            let seqrec = record.context("couldn't parse FASTA record")?;
            let seqid = seqrec.id();
            let seq = seqrec.seq();
            stats.num_records += 1;
            sigs.add_name(seqid);
            sigs.add_seq(&seq);

            if seq.ends_with(poly_a) {
                let owned = String::from_utf8(seq.into_owned()).unwrap();
                let trimmed = owned.trim_end_matches('A');
                stats.num_trimmed += 1;
                writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
                writeln!(&mut seq_file, "{}", trimmed)?;
            } else {
                writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
                writeln!(&mut seq_file, "{}", String::from_utf8_lossy(&seq))?;
            }
        }
    }

    let output_filename = output_filename.with_additional_extension(".json");
    let sig_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(output_filename)?;

    eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
    write_sig_file(sig_file, &stats, sigs)?;
    Ok(vec![seq_filename])
}

/// Write the output signature and statistics file
/// in `JSON` format to `sig_file`.
fn write_sig_file(
    sig_file: std::fs::File,
    stats: &PrepStats,
    sigs: Signatures,
) -> anyhow::Result<()> {
    let mut sig_file = BufWriter::new(sig_file);

    let v = json!({
        "sha256_names" : hex::encode(sigs.name_hash_256.finalize()),
        "sha512_names" : hex::encode(sigs.name_hash_512.finalize()),
        "sha512_seqs" : hex::encode(sigs.seq_hash_512.finalize()),
        "sha256_seqs" : hex::encode(sigs.seq_hash_256.finalize()),
        "num_records" : stats.num_records,
        "num_trimmed_polya" : stats.num_trimmed
    });

    write!(&mut sig_file, "{}", v)?;
    Ok(())
}

/// Stream through the input sequences, computing the statistics and signatures.
fn stream_for_signatures<T: AsRef<Path>>(
    filenames: &[T],
    output_filename: PathBuf,
    mut stats: PrepStats,
    mut sigs: Signatures,
) -> anyhow::Result<()> {
    for filename in filenames.iter() {
        let mut reader = parse_fastx_file(filename).context("invalid input path/file")?;
        while let Some(record) = reader.next() {
            let seqrec = record.context("couldn't parse FASTA record")?;
            let seqid = seqrec.id();
            let seq = seqrec.seq();
            stats.num_records += 1;
            sigs.add_name(seqid);
            sigs.add_seq(&seq);
        }
    }

    let output_filename = output_filename.with_additional_extension(".json");
    let sig_file = OpenOptions::new()
        .write(true)
        .truncate(true)
        .create(true)
        .open(output_filename)?;

    //eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
    write_sig_file(sig_file, &stats, sigs)
}

/// Parses the list of input files according to the configuration.
/// If modifications are to be made (e.g. polyA clipping is enabled), then
/// it will return a Vec<PathBuf> containing a single element; the file path
/// of the modified reference it created. If no alterations are being made
/// and instead the signatures are just being computed, then it will return
/// a Vec<PathBuf> containing the paths to the initial input files it was
/// given.
pub fn parse_records<T: AsRef<Path> + AsRef<OsStr>>(
    args: RecordParseConfig<T>,
) -> anyhow::Result<Vec<PathBuf>> {
    let filenames = args.input;
    let output_filename = args.output_stem;
    let stats = PrepStats::new();
    let sigs = Signatures::new();
    if let Some(alen) = args.polya_clip_length {
        let poly_a = vec![b'A'; alen];
        stream_with_clip(&filenames, &poly_a, output_filename, stats, sigs)
    } else {
        stream_for_signatures(&filenames, output_filename, stats, sigs)?;
        Ok(filenames.iter().map(PathBuf::from).collect())
    }
}