fq_filter_reads/lib.rs
1pub mod cli;
2
3use bio::io::fastq;
4use bio::io::fastq::Record;
5use flate2::read::GzDecoder;
6use log::debug;
7use std::collections::HashSet;
8use std::fs::File;
9use std::io::BufRead;
10use std::io::BufReader;
11use std::io::Read;
12
13/// Print a read if read id in id list
14///
15/// # Arguments
16/// * `record` - a fastq record
17/// * `id_list` - hash table of id list
18/// * `inverse` - true if throw away reads with id in the id list, false otherwise
19///
20/// # Return
21/// * 1 if the read is printed, 0 otherwise
22///
23/// # Example
24/// ```
25/// use bio::io::fastq::Record;
26/// use std::collections::HashSet;
27/// use fq_filter_reads::process_read;
28///
29/// // mock record
30/// let record1 = Record::with_attrs(
31/// "a",
32/// None,
33/// b"AAA",
34/// b"AAA",
35/// );
36/// let record2 = Record::with_attrs(
37/// "b",
38/// None,
39/// b"AAA",
40/// b"AAA",
41/// );
42///
43/// // mock hash table
44/// let mut hash: HashSet<String> = HashSet::new();
45/// hash.insert("a".to_string());
46/// hash.insert("t".to_string());
47///
48/// assert_eq!(process_read(&record1, &hash, false).unwrap(), 1);
49/// assert_eq!(process_read(&record2, &hash, false).unwrap(), 0);
50///
51/// assert_eq!(process_read(&record1, &hash, true).unwrap(), 0);
52/// assert_eq!(process_read(&record2, &hash, true).unwrap(), 1);
53///
54/// ```
55pub fn process_read(
56 record: &Record,
57 id_list: &HashSet<String>,
58 inverse: bool,
59) -> Result<u64, String> {
60 let mut out_count = 0;
61 let seq_id: String = record.id().to_string();
62 let read_in_list: bool = id_list.contains(&seq_id);
63 if (read_in_list && !inverse) | (!read_in_list && inverse) {
64 let record_string = record.to_string();
65 let record_fmt = record_string.strip_suffix('\n').ok_or("Bad record")?;
66 println!("{record_fmt}");
67 out_count += 1;
68 }
69 Ok(out_count)
70}
71
72/// Filter out reads with ID not in the given id list
73///
74/// # Arguments
75/// * `fastq_file` - file path to the fastq file for filtering
76/// * `id_list` - hash table storing the hashed read names that won't be filtered out
77///
78/// # Returns
79/// - tuple (total read count, number of reads retained)
80///
81/// # Example
82/// ```
83/// use std::collections::HashSet;
84/// use std::io::prelude::*;
85/// use std::fs::File;
86/// use flate2::GzBuilder;
87/// use flate2::Compression;
88/// use fq_filter_reads::filter_fq;
89///
90/// // mock data
91/// let tmp_fq = "/tmp/test.fq.gz";
92/// let f = File::create(tmp_fq).unwrap();
93/// let mut gz = GzBuilder::new()
94/// .filename("test.fq.gz")
95/// .comment("test file")
96/// .write(f, Compression::default());
97/// let data = b"@a\nAAA\n+\nAAA\n@c\nCCC\n+\nCCC\n@t\nTTT\n+\nTTT";
98/// gz.write_all(data).unwrap();
99/// gz.finish().unwrap();
100///
101/// let mut hash: HashSet<String> = HashSet::new();
102/// hash.insert("a".to_string());
103/// hash.insert("t".to_string());
104///
105/// let (read_count, out_count) = filter_fq(tmp_fq, &hash, false).unwrap();
106/// assert_eq!(read_count, 3);
107/// assert_eq!(out_count, 2);
108///
109/// let (read_count2, out_count2) = filter_fq(tmp_fq, &hash, true).unwrap();
110/// assert_eq!(read_count2, 3);
111/// assert_eq!(out_count2, 1);
112/// ```
113pub fn filter_fq(
114 fastq_file: &str,
115 id_list: &HashSet<String>,
116 inverse: bool,
117) -> Result<(u64, u64), String> {
118 let is_gz_input = fastq_file.ends_with(".gz");
119 //let reader = fastq::Reader::from_file(fastq_file).map_err(|e| e.to_string())?;
120 let mut read_count = 0;
121 let mut out_count = 0;
122
123 let file = File::open(fastq_file).map_err(|e| e.to_string())?;
124 // solution from:
125 // https://users.rust-lang.org/t/solved-optional-bufreader-gzdecoder-or-bufreader-file/24714/2
126 let buf: Box<dyn Read> = match is_gz_input {
127 true => Box::new(GzDecoder::new(file)),
128 false => Box::new(file),
129 };
130 let reader = fastq::Reader::new(buf);
131 for result in reader.records() {
132 let record = result.map_err(|e| e.to_string())?;
133 let oc = process_read(&record, id_list, inverse)?;
134 read_count += 1;
135 out_count += oc
136 }
137
138 Ok((read_count, out_count))
139}
140
141/// Getting fastq record ID from a file
142///
143/// # Arguments
144/// * `id_file` - file path to the id list
145///
146/// # Returns
147/// * `id_list` - the hash table for all ids
148///
149/// # Example
150/// ```
151/// use std::fs;
152/// use fq_filter_reads::get_list;
153/// let data = "a\nb\nc";
154/// let filename = "/tmp/id_file";
155/// fs::write(filename, data).unwrap();
156///
157/// let ids = get_list(filename).unwrap();
158///
159/// assert!(ids.contains(&"a".to_string()));
160/// assert!(ids.contains(&"b".to_string()));
161/// assert!(ids.contains(&"c".to_string()));
162/// assert!(!ids.contains(&"d".to_string()));
163///
164/// ```
165pub fn get_list(id_file: &str) -> Result<HashSet<String>, String> {
166 let file = File::open(id_file).map_err(|e| e.to_string())?;
167 let reader = BufReader::new(file);
168
169 let mut id_set = HashSet::new();
170 for line in reader.lines() {
171 let id = line.map_err(|e| e.to_string())?;
172 debug!("{}", id);
173 id_set.insert(id);
174 }
175 Ok(id_set)
176}
177
178#[cfg(test)]
179mod tests {
180 use super::*;
181 use flate2::{Compression, GzBuilder};
182 use rstest::*;
183 use std::io::prelude::*;
184
185 fn mock_input(gz: bool) -> String {
186 let data = b"@a\nAAA\n+\nAAA\n@c\nCCC\n+\nCCC\n@t\nTTT\n+\nTTT";
187 let mut tmp_fq = "/tmp/test.fq".to_string();
188 // mock data
189 if gz {
190 tmp_fq = format!("{tmp_fq}.gz");
191 let f = File::create(&tmp_fq).unwrap();
192 let mut gz = GzBuilder::new()
193 .filename("test.fq.gz")
194 .comment("test file")
195 .write(f, Compression::default());
196 gz.write_all(data).unwrap();
197 gz.finish().unwrap();
198 } else {
199 let mut file = File::create(&tmp_fq).unwrap();
200 file.write_all(data).unwrap();
201 }
202 tmp_fq
203 }
204
205 #[rstest]
206 #[case(true)]
207 #[case(false)]
208 fn test_filter_fq(#[case] gz: bool) {
209 let tmp_fq_string = mock_input(gz);
210 let tmp_fq = tmp_fq_string.as_str();
211
212 let mut hash: HashSet<String> = HashSet::new();
213 hash.insert("a".to_string());
214 hash.insert("t".to_string());
215
216 let (read_count, out_count) = filter_fq(&tmp_fq, &hash, false).unwrap();
217 assert_eq!(read_count, 3);
218 assert_eq!(out_count, 2);
219
220 let (read_count2, out_count2) = filter_fq(&tmp_fq, &hash, true).unwrap();
221 assert_eq!(read_count2, 3);
222 assert_eq!(out_count2, 1);
223 }
224}