Skip to main content

blobtk/
fasta.rs

1use std::collections::HashSet;
2use std::io::Write;
3use std::path::PathBuf;
4
5extern crate needletail;
6use needletail::parser::{write_fasta, LineEnding};
7use needletail::FastxReader;
8
9use crate::fastq::{open_fastx, suffix_file_name};
10use crate::io::get_writer;
11use crate::utils::styled_progress_bar;
12
/// Returns the leading part of `input` up to, but not including, the first
/// ASCII space (`b' '`).
///
/// If `input` contains no space, the whole slice is copied. Only the space
/// byte is treated as a delimiter; other whitespace such as tabs is kept as
/// part of the returned identifier.
fn trim_seq_id(input: &[u8]) -> Vec<u8> {
    // Find the cut point first so the identifier is copied in one go rather
    // than collected byte by byte.
    let end = input
        .iter()
        .position(|&byte| byte == b' ')
        .unwrap_or(input.len());
    input[..end].to_vec()
}
21
22fn subsample_fasta<F: Fn()>(
23    seq_names: &HashSet<Vec<u8>>,
24    mut reader: Box<dyn FastxReader>,
25    writer: &mut dyn Write,
26    callback: &Option<F>,
27) {
28    let total = seq_names.len();
29    let progress_bar = styled_progress_bar(total, "Subsampling FASTA");
30
31    while let Some(record) = reader.next() {
32        let seqrec = record.as_ref().expect("invalid record");
33        let seq_id: Vec<u8> = trim_seq_id(seqrec.id());
34        if seq_names.contains(&seq_id) {
35            write_fasta(seqrec.id(), &seqrec.seq(), writer, LineEnding::Unix)
36                .expect("Unable to write FASTA");
37            progress_bar.inc(1);
38            if progress_bar.position() as usize == total {
39                break;
40            }
41        }
42
43        if let Some(cb) = callback {
44            cb()
45        }
46    }
47    progress_bar.finish();
48}
49
50pub fn subsample<F: Fn()>(
51    seq_names: &HashSet<Vec<u8>>,
52    fasta_path: &Option<PathBuf>,
53    fasta_out: &bool,
54    suffix: &String,
55    callback: &Option<F>,
56) {
57    if fasta_path.is_none() {
58        return;
59    }
60    if !fasta_out {
61        return;
62    }
63
64    let reader = open_fastx(fasta_path);
65    let out_path = suffix_file_name(fasta_path.as_ref().unwrap(), suffix);
66    let mut writer = get_writer(&Some(out_path));
67
68    if let Some(r) = reader {
69        subsample_fasta(seq_names, r, &mut *writer, callback);
70    }
71}