use flate2::write::GzEncoder;
use flate2::Compression;
use needletail::parse_fastx_file;
use serde_json::json;
use sha2::{Digest, Sha256, Sha512};
use std::fs::OpenOptions;
use std::io::{BufWriter, Write};
use std::path::PathBuf;
struct PrepStats {
num_trimmed: u64,
num_records: u64,
}
impl PrepStats {
fn new() -> Self {
Self {
num_trimmed: 0,
num_records: 0,
}
}
}
struct Signatures {
name_hash_256: Sha256,
seq_hash_256: Sha256,
name_hash_512: Sha512,
seq_hash_512: Sha512,
}
impl Signatures {
fn new() -> Self {
Self {
name_hash_256: Sha256::new(),
seq_hash_256: Sha256::new(),
name_hash_512: Sha512::new(),
seq_hash_512: Sha512::new(),
}
}
fn add_name(&mut self, n: &[u8]) {
self.name_hash_256.update(n);
self.name_hash_512.update(n);
}
fn add_seq(&mut self, s: &[u8]) {
self.seq_hash_256.update(s);
self.seq_hash_512.update(s);
}
}
pub struct RecordParseConfig {
pub input: Vec<String>,
pub output_stem: PathBuf,
pub polya_clip_length: Option<usize>,
}
fn stream_with_clip(
filenames: &[String],
poly_a: &[u8],
output_filename: PathBuf,
mut stats: PrepStats,
mut sigs: Signatures,
) -> anyhow::Result<Vec<PathBuf>> {
let seq_filename = output_filename.with_extension("fa.gz");
let seq_file = OpenOptions::new()
.write(true)
.truncate(true)
.create(true)
.open(seq_filename.clone())?;
let seq_file = GzEncoder::new(seq_file, Compression::default());
let mut seq_file = BufWriter::new(seq_file);
for filename in filenames.iter() {
let mut reader = parse_fastx_file(filename).expect("valid path/file");
while let Some(record) = reader.next() {
let seqrec = record.expect("invalid record");
let seqid = seqrec.id();
let seq = seqrec.seq();
stats.num_records += 1;
sigs.add_name(seqid);
sigs.add_seq(&seq);
if seq.ends_with(poly_a) {
let owned = String::from_utf8(seq.into_owned()).unwrap();
let trimmed = owned.trim_end_matches('A');
stats.num_trimmed += 1;
writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
writeln!(&mut seq_file, "{}", trimmed)?;
} else {
writeln!(&mut seq_file, ">{}", String::from_utf8_lossy(seqrec.id()))?;
writeln!(&mut seq_file, "{}", String::from_utf8_lossy(&seq))?;
}
}
}
let sig_file = OpenOptions::new()
.write(true)
.truncate(true)
.create(true)
.open(output_filename.with_extension("json"))?;
eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
write_sig_file(sig_file, &stats, sigs)?;
Ok(vec![seq_filename])
}
fn write_sig_file(
sig_file: std::fs::File,
stats: &PrepStats,
sigs: Signatures,
) -> anyhow::Result<()> {
let mut sig_file = BufWriter::new(sig_file);
let v = json!({
"sha256_names" : hex::encode(sigs.name_hash_256.finalize()),
"sha512_names" : hex::encode(sigs.name_hash_512.finalize()),
"sha512_seqs" : hex::encode(sigs.seq_hash_512.finalize()),
"sha256_seqs" : hex::encode(sigs.seq_hash_256.finalize()),
"num_records" : stats.num_records,
"num_trimmed_polya" : stats.num_trimmed
});
write!(&mut sig_file, "{}", v)?;
Ok(())
}
fn stream_for_signatures(
filenames: &[String],
output_filename: PathBuf,
mut stats: PrepStats,
mut sigs: Signatures,
) -> anyhow::Result<()> {
for filename in filenames.iter() {
let mut reader = parse_fastx_file(filename).expect("valid path/file");
while let Some(record) = reader.next() {
let seqrec = record.expect("invalid record");
let seqid = seqrec.id();
let seq = seqrec.seq();
stats.num_records += 1;
sigs.add_name(seqid);
sigs.add_seq(&seq);
}
}
let sig_file = OpenOptions::new()
.write(true)
.truncate(true)
.create(true)
.open(output_filename.with_extension("json"))?;
eprintln!("trimmed polyA tails from {} records", stats.num_trimmed);
write_sig_file(sig_file, &stats, sigs)
}
pub fn parse_records(args: RecordParseConfig) -> anyhow::Result<Vec<PathBuf>> {
let filenames = args.input;
let clip_polya;
let poly_a = if let Some(alen) = args.polya_clip_length {
clip_polya = true;
vec![b'A'; alen]
} else {
clip_polya = false;
vec![]
};
let output_filename = args.output_stem;
let stats = PrepStats::new();
let sigs = Signatures::new();
if clip_polya {
stream_with_clip(&filenames, &poly_a, output_filename, stats, sigs)
} else {
stream_for_signatures(&filenames, output_filename, stats, sigs)?;
Ok(filenames.iter().map(PathBuf::from).collect())
}
}