filter_clipped/
lib.rs

1pub mod cli;
2pub mod clipping;
3
4use cli::Parser;
5use clipping::ClipStat;
6
7use log::{debug, info};
8use rust_htslib::{
9    bam,
10    bam::{record::CigarStringView, Header, Read, Reader, Record},
11};
12
13/// Workflow to process an input bam file and write the pass-filter alignments
14/// into a new bam file
15///
16/// # Arguments
17/// - `in_bam`: input bam file
18/// - `out_bam`: output bam file
19/// - `inverse`: boolean flag to indicate whether we want to write out the failed-filter alignments only
20/// - `both_end`: maximum fraction of total clipped bases relative to the read sequence length to consider as pass
21/// - `left_side`: maximum fraction of of clipped bases on either side relative to the read sequence length to consider as pass
22/// - `right_side`: maximum fraction of of clipped bases on either side relative to the read sequence length to consider as pass
23///
24/// # Examples
25///
26/// ```
27/// use filter_clipped::run;
28/// use rust_htslib::bam;
29/// use rust_htslib::bam::Read;
30/// fn count_bam(bam_file: String, expected_count: i32) {
31///     // helper function to verify result
32///     let mut bam_reader = bam::Reader::from_path(bam_file).unwrap();
33///     let mut aln_count = 0;
34///     for r in bam_reader.records() {
35///         let _record = r.unwrap();
36///         aln_count += 1;
37///     }
38///     assert_eq!(expected_count, aln_count);
39/// }
40///
41/// let out_bam = "out.sam";
42/// run(
43///     "test/data/test.sam".to_string(),
44///     out_bam.to_string(),
45///     false,
46///     0.1,
47///     0.1,
48///     0.1,
49///     false,
50///     );
51/// count_bam(out_bam.to_string(), 6);
52/// ```
53pub fn run(
54    in_bam: String,
55    out_bam: String,
56    inverse: bool,
57    both_end: f64,
58    left_side: f64,
59    right_side: f64,
60    unalign: bool,
61) -> Result<u8, String> {
62    let mut out_count: u32 = 0;
63    let mut in_count: u32 = 0;
64    let mut unaligned_count: u32 = 0;
65    info!("Reading from alignment file: {}", in_bam);
66    info!("Writing to alignment file: {}", out_bam);
67    info!(
68        "Thresholds: trailing clipped: {}, leading clipped: {}, total clipped: {}",
69        right_side, left_side, both_end
70    );
71    let mut in_bam: Reader = match in_bam.eq("-") {
72        true => bam::Reader::from_stdin().map_err(|e| e.to_string())?,
73        _ => bam::Reader::from_path(&in_bam).map_err(|e| e.to_string())?,
74    };
75    let header: Header = bam::Header::from_template(in_bam.header());
76
77    let mut out_bam = match out_bam.eq("-") {
78        true => bam::Writer::from_stdout(&header, bam::Format::Bam).map_err(|e| e.to_string())?,
79        _ => bam::Writer::from_path(&out_bam, &header, bam::Format::Bam)
80            .map_err(|e| e.to_string())?,
81    };
82
83    for r in in_bam.records() {
84        in_count += 1;
85        let mut record: Record = r.map_err(|e| e.to_string())?;
86        let seq_len: f64 = record.seq().len() as f64;
87        let cigar: CigarStringView = record.cigar();
88
89        let leading_clipped: Vec<i64> = vec![cigar.leading_softclips(), cigar.leading_hardclips()];
90        let trailing_cliped: Vec<i64> =
91            vec![cigar.trailing_softclips(), cigar.trailing_hardclips()];
92
93        let clip_stat: ClipStat = ClipStat::new(leading_clipped, trailing_cliped);
94
95        let keep: bool = clip_stat.total_fraction(seq_len)? < both_end
96            && clip_stat.left_fraction(seq_len)? <= left_side
97            && clip_stat.right_fraction(seq_len)? <= right_side;
98
99        debug!("{:?} {}", clip_stat, seq_len);
100        if !(unalign) {
101            if (keep && !inverse) || (inverse && !keep) {
102                out_bam.write(&record).map_err(|e| e.to_string())?;
103                out_count += 1;
104            }
105        } else {
106            if keep {
107                out_bam.write(&record).map_err(|e| e.to_string())?;
108            } else {
109                record.set_unmapped();
110                record.unset_reverse();
111                record.unset_proper_pair();
112                record.set_tid(-1);
113                record.set_pos(-1);
114                out_bam.write(&record).map_err(|e| e.to_string())?;
115                unaligned_count += 1
116            }
117            out_count += 1;
118        }
119    }
120    info!(
121        "Read {} alignments; Written {} alignments; Making {} to unaligned",
122        in_count, out_count, unaligned_count,
123    );
124    Ok(0) // exit code 0
125}
126
127/// Just a wrapper function to read command line arguments and pass it to `run`
128///
129pub fn wrapper() {
130    let args = cli::Command::parse();
131    let result = run(
132        args.in_bam,
133        args.out_bam,
134        args.inverse,
135        args.both_end,
136        args.left_side,
137        args.right_side,
138        args.unalign,
139    );
140    match result {
141        Ok(_) => (),
142        Err(err) => println!("{}", err),
143    };
144}
145
146#[cfg(test)]
147mod tests {
148    use super::*;
149    use rstest::rstest;
150    use std::string::String;
151
152    fn count_bam(bam_file: String, expected_count: i32, expected_unaligned: i32) {
153        let mut bam_reader = bam::Reader::from_path(bam_file).unwrap();
154        let mut aln_count = 0;
155        let mut unaligned_count = 0;
156        for r in bam_reader.records() {
157            let _record = r.unwrap();
158            aln_count += 1;
159            if _record.is_unmapped() {
160                unaligned_count += 1;
161            }
162        }
163        assert_eq!(expected_count, aln_count);
164
165        if expected_unaligned > 0 {
166            assert_eq!(expected_unaligned, unaligned_count);
167        }
168    }
169
170    #[rstest]
171    #[case(1, 0.2, 0.3, true, 0, 0, false)]
172    #[case(2, 0.2, 0.3, false, 9, 0, false)]
173    #[case(3, 0.1, 0.1, false, 6, 0, false)]
174    #[case(4, 0.1, 0.1, false, 9, 3, true)]
175    #[case(4, 0.1, 0.1, false, 9, 3, true)]
176    fn test_run(
177        #[case] test_case: usize,
178        #[case] max_both_end: f64,
179        #[case] max_single_end: f64,
180        #[case] inverse: bool,
181        #[case] expected_count: i32,
182        #[case] expected_unaligned: i32,
183        #[case] unalign: bool,
184    ) {
185        let out_bam: &str = &format!("test/data/out_{}.bam", test_case);
186        let result = run(
187            "test/data/test.sam".to_string(),
188            out_bam.to_string(),
189            inverse,
190            max_both_end,
191            max_single_end,
192            max_single_end,
193            unalign,
194        )
195        .unwrap();
196        assert_eq!(result, 0);
197        count_bam(out_bam.to_string(), expected_count, expected_unaligned);
198    }
199}