fq_filter_reads/
lib.rs

1pub mod cli;
2
3use bio::io::fastq;
4use bio::io::fastq::Record;
5use flate2::read::GzDecoder;
6use log::debug;
7use std::collections::HashSet;
8use std::fs::File;
9use std::io::BufRead;
10use std::io::BufReader;
11use std::io::Read;
12
13/// Print a read if read id in id list
14///
15/// # Arguments
16/// * `record` - a fastq record
17/// * `id_list` - hash table of id list
18/// * `inverse` - true if throw away reads with id in the id list, false otherwise
19///
20/// # Return
21/// * 1 if the read is printed, 0 otherwise
22///
23/// # Example
24/// ```
25/// use bio::io::fastq::Record;
26/// use std::collections::HashSet;
27/// use fq_filter_reads::process_read;
28///
29/// // mock record
30/// let record1 = Record::with_attrs(
31///     "a",
32///     None,
33///     b"AAA",
34///     b"AAA",
35/// );
36/// let record2 = Record::with_attrs(
37///     "b",
38///     None,
39///     b"AAA",
40///     b"AAA",
41/// );
42///
43/// // mock hash table
44/// let mut hash: HashSet<String> = HashSet::new();
45/// hash.insert("a".to_string());
46/// hash.insert("t".to_string());
47///
48/// assert_eq!(process_read(&record1, &hash, false).unwrap(), 1);
49/// assert_eq!(process_read(&record2, &hash, false).unwrap(), 0);
50///
51/// assert_eq!(process_read(&record1, &hash, true).unwrap(), 0);
52/// assert_eq!(process_read(&record2, &hash, true).unwrap(), 1);
53///
54/// ```
55pub fn process_read(
56    record: &Record,
57    id_list: &HashSet<String>,
58    inverse: bool,
59) -> Result<u64, String> {
60    let mut out_count = 0;
61    let seq_id: String = record.id().to_string();
62    let read_in_list: bool = id_list.contains(&seq_id);
63    if (read_in_list && !inverse) | (!read_in_list && inverse) {
64        let record_string = record.to_string();
65        let record_fmt = record_string.strip_suffix('\n').ok_or("Bad record")?;
66        println!("{record_fmt}");
67        out_count += 1;
68    }
69    Ok(out_count)
70}
71
72/// Filter out reads with ID not in the given id list
73///
74/// # Arguments
75/// * `fastq_file` - file path to the fastq file for filtering
76/// * `id_list` - hash table storing the hashed read names that won't be filtered out
77///
78/// # Returns
79/// - tuple (total read count, number of reads retained)
80///
81/// # Example
82/// ```
83/// use std::collections::HashSet;
84/// use std::io::prelude::*;
85/// use std::fs::File;
86/// use flate2::GzBuilder;
87/// use flate2::Compression;
88/// use fq_filter_reads::filter_fq;
89///
90/// // mock data
91/// let tmp_fq = "/tmp/test.fq.gz";
92/// let f = File::create(tmp_fq).unwrap();
93/// let mut gz = GzBuilder::new()
94///     .filename("test.fq.gz")
95///     .comment("test file")
96///     .write(f, Compression::default());
97/// let data = b"@a\nAAA\n+\nAAA\n@c\nCCC\n+\nCCC\n@t\nTTT\n+\nTTT";
98/// gz.write_all(data).unwrap();
99/// gz.finish().unwrap();
100///
101/// let mut hash: HashSet<String> = HashSet::new();
102/// hash.insert("a".to_string());
103/// hash.insert("t".to_string());
104///
105/// let (read_count, out_count) = filter_fq(tmp_fq, &hash, false).unwrap();
106/// assert_eq!(read_count, 3);
107/// assert_eq!(out_count, 2);
108///
109/// let (read_count2, out_count2) = filter_fq(tmp_fq, &hash, true).unwrap();
110/// assert_eq!(read_count2, 3);
111/// assert_eq!(out_count2, 1);
112/// ```
113pub fn filter_fq(
114    fastq_file: &str,
115    id_list: &HashSet<String>,
116    inverse: bool,
117) -> Result<(u64, u64), String> {
118    let is_gz_input = fastq_file.ends_with(".gz");
119    //let reader = fastq::Reader::from_file(fastq_file).map_err(|e| e.to_string())?;
120    let mut read_count = 0;
121    let mut out_count = 0;
122
123    let file = File::open(fastq_file).map_err(|e| e.to_string())?;
124    // solution from:
125    // https://users.rust-lang.org/t/solved-optional-bufreader-gzdecoder-or-bufreader-file/24714/2
126    let buf: Box<dyn Read> = match is_gz_input {
127        true => Box::new(GzDecoder::new(file)),
128        false => Box::new(file),
129    };
130    let reader = fastq::Reader::new(buf);
131    for result in reader.records() {
132        let record = result.map_err(|e| e.to_string())?;
133        let oc = process_read(&record, id_list, inverse)?;
134        read_count += 1;
135        out_count += oc
136    }
137
138    Ok((read_count, out_count))
139}
140
141/// Getting fastq record ID from a file
142///
143/// # Arguments
144/// * `id_file` - file path to the id list
145///
146/// # Returns
147/// * `id_list` - the hash table for all ids
148///
149/// # Example
150/// ```
151/// use std::fs;
152/// use fq_filter_reads::get_list;
153/// let data = "a\nb\nc";
154/// let filename = "/tmp/id_file";
155/// fs::write(filename, data).unwrap();
156///
157/// let ids = get_list(filename).unwrap();
158///
159/// assert!(ids.contains(&"a".to_string()));
160/// assert!(ids.contains(&"b".to_string()));
161/// assert!(ids.contains(&"c".to_string()));
162/// assert!(!ids.contains(&"d".to_string()));
163///
164/// ```
165pub fn get_list(id_file: &str) -> Result<HashSet<String>, String> {
166    let file = File::open(id_file).map_err(|e| e.to_string())?;
167    let reader = BufReader::new(file);
168
169    let mut id_set = HashSet::new();
170    for line in reader.lines() {
171        let id = line.map_err(|e| e.to_string())?;
172        debug!("{}", id);
173        id_set.insert(id);
174    }
175    Ok(id_set)
176}
177
178#[cfg(test)]
179mod tests {
180    use super::*;
181    use flate2::{Compression, GzBuilder};
182    use rstest::*;
183    use std::io::prelude::*;
184
185    fn mock_input(gz: bool) -> String {
186        let data = b"@a\nAAA\n+\nAAA\n@c\nCCC\n+\nCCC\n@t\nTTT\n+\nTTT";
187        let mut tmp_fq = "/tmp/test.fq".to_string();
188        // mock data
189        if gz {
190            tmp_fq = format!("{tmp_fq}.gz");
191            let f = File::create(&tmp_fq).unwrap();
192            let mut gz = GzBuilder::new()
193                .filename("test.fq.gz")
194                .comment("test file")
195                .write(f, Compression::default());
196            gz.write_all(data).unwrap();
197            gz.finish().unwrap();
198        } else {
199            let mut file = File::create(&tmp_fq).unwrap();
200            file.write_all(data).unwrap();
201        }
202        tmp_fq
203    }
204
205    #[rstest]
206    #[case(true)]
207    #[case(false)]
208    fn test_filter_fq(#[case] gz: bool) {
209        let tmp_fq_string = mock_input(gz);
210        let tmp_fq = tmp_fq_string.as_str();
211
212        let mut hash: HashSet<String> = HashSet::new();
213        hash.insert("a".to_string());
214        hash.insert("t".to_string());
215
216        let (read_count, out_count) = filter_fq(&tmp_fq, &hash, false).unwrap();
217        assert_eq!(read_count, 3);
218        assert_eq!(out_count, 2);
219
220        let (read_count2, out_count2) = filter_fq(&tmp_fq, &hash, true).unwrap();
221        assert_eq!(read_count2, 3);
222        assert_eq!(out_count2, 1);
223    }
224}