1use std::collections::HashSet;
2use std::io::Write;
3use std::path::PathBuf;
4
5extern crate needletail;
6use needletail::parser::{write_fasta, LineEnding};
7use needletail::FastxReader;
8
9use crate::fastq::{open_fastx, suffix_file_name};
10use crate::io::get_writer;
11use crate::utils::styled_progress_bar;
12
/// Return the leading part of `input` up to, but not including, the first
/// space byte (`b' '`).
///
/// When `input` contains no space the whole slice is copied; when it starts
/// with a space the result is empty. Only the space byte is treated as a
/// delimiter; tabs and other whitespace are kept.
fn trim_seq_id(input: &[u8]) -> Vec<u8> {
    let end = input
        .iter()
        .position(|&byte| byte == b' ')
        .unwrap_or(input.len());
    input[..end].to_vec()
}
21
22fn subsample_fasta<F: Fn()>(
23 seq_names: &HashSet<Vec<u8>>,
24 mut reader: Box<dyn FastxReader>,
25 writer: &mut dyn Write,
26 callback: &Option<F>,
27) {
28 let total = seq_names.len();
29 let progress_bar = styled_progress_bar(total, "Subsampling FASTA");
30
31 while let Some(record) = reader.next() {
32 let seqrec = record.as_ref().expect("invalid record");
33 let seq_id: Vec<u8> = trim_seq_id(seqrec.id());
34 if seq_names.contains(&seq_id) {
35 write_fasta(seqrec.id(), &seqrec.seq(), writer, LineEnding::Unix)
36 .expect("Unable to write FASTA");
37 progress_bar.inc(1);
38 if progress_bar.position() as usize == total {
39 break;
40 }
41 }
42
43 if let Some(cb) = callback {
44 cb()
45 }
46 }
47 progress_bar.finish();
48}
49
50pub fn subsample<F: Fn()>(
51 seq_names: &HashSet<Vec<u8>>,
52 fasta_path: &Option<PathBuf>,
53 fasta_out: &bool,
54 suffix: &String,
55 callback: &Option<F>,
56) {
57 if fasta_path.is_none() {
58 return;
59 }
60 if !fasta_out {
61 return;
62 }
63
64 let reader = open_fastx(fasta_path);
65 let out_path = suffix_file_name(fasta_path.as_ref().unwrap(), suffix);
66 let mut writer = get_writer(&Some(out_path));
67
68 if let Some(r) = reader {
69 subsample_fasta(seq_names, r, &mut *writer, callback);
70 }
71}