fibertools_rs/utils/
input_bam.rs

1use crate::cli;
2use crate::fiber::FiberseqRecords;
3use crate::utils::bio_io;
4use clap::{Args, ValueHint};
5use rust_htslib::bam;
6use rust_htslib::bam::Read;
7use std::fmt::Debug;
8
9pub static MIN_ML_SCORE: &str = "125";
10
11/// This struct establishes the way a Fiber-seq bam should be filtered when it is read in.
12/// This struct is both used as an argument for building FiberSeq records and as a struct that is parsed into cli arguments.
13#[derive(Debug, Args, Clone)]
14pub struct FiberFilters {
15    /// BAM bit flags to filter on, equivalent to `-F` in samtools view
16    #[clap(
17        global = true,
18        short = 'F',
19        long = "filter",
20        default_value = "0",
21        help_heading = "BAM-Options"
22    )]
23    pub bit_flag: u16,
24    /// Filtering expression to use for filtering records
25    /// Example: filter to nucleosomes with lengths greater than 150 bp
26    ///   -x "len(nuc)>150"
27    /// Example: filter to msps with lengths between 30 and 49 bp
28    ///   -x "len(msp)=30:50"
29    /// Example: combine 2+ filter expressions
30    ///   -x "len(nuc)<150,len(msp)=30:50"
31    /// Filtering expressions support len() and qual() functions over msp, nuc, m6a, cpg
32    #[clap(
33        global = true,
34        short = 'x',
35        long = "ftx",
36        alias = "ft-expression",
37        help_heading = "BAM-Options"
38    )]
39    pub filter_expression: Option<String>,
40    /// Minium score in the ML tag to use or include in the output
41    #[clap(long="ml", alias="min-ml-score", default_value = MIN_ML_SCORE, help_heading = "BAM-Options", env="FT_MIN_ML_SCORE")]
42    pub min_ml_score: u8,
43    /// strip basemods in the first or last X bp of the read
44    #[clap(
45        global = true,
46        long,
47        default_value = "0",
48        help_heading = "BAM-Options",
49        hide = true
50    )]
51    pub strip_starting_basemods: i64,
52}
53
54impl std::default::Default for FiberFilters {
55    fn default() -> Self {
56        Self {
57            bit_flag: 0,
58            min_ml_score: MIN_ML_SCORE.parse().unwrap(),
59            filter_expression: None,
60            strip_starting_basemods: 0,
61        }
62    }
63}
64
65impl FiberFilters {
66    /// This function accepts an iterator over bam records and filters them based on the bit flag.
67    pub fn filter_on_bit_flags<'a, I>(
68        &'a self,
69        records: I,
70    ) -> impl Iterator<Item = bam::Record> + 'a
71    where
72        I: IntoIterator<Item = Result<bam::Record, rust_htslib::errors::Error>> + 'a,
73    {
74        records
75            .into_iter()
76            .map(|r| r.expect("htslib is unable to read a record in the input."))
77            .filter(|r| {
78                // filter by bit flag
79                (r.flags() & self.bit_flag) == 0
80            })
81    }
82}
83
84/// This struct is used to parse the input bam file and the filters that should be applied to the bam file.
85/// This struct is parsed to create command line arguments and then passed to many functions.
86#[derive(Debug, Args)]
87pub struct InputBam {
88    /// Input BAM file. If no path is provided stdin is used. For m6A prediction, this should be a HiFi bam file with kinetics data. For other commands, this should be a bam file with m6A calls.
89    #[clap(default_value = "-", value_hint = ValueHint::AnyPath)]
90    pub bam: String,
91    #[clap(flatten)]
92    pub filters: FiberFilters,
93    #[clap(flatten)]
94    pub global: cli::GlobalOpts,
95    /// by skipping this field it is not parsed as a command line argument
96    #[clap(skip)]
97    pub header: Option<bam::Header>,
98}
99
100impl InputBam {
101    pub fn bam_reader(&mut self) -> bam::Reader {
102        let mut bam = bio_io::bam_reader(&self.bam);
103        bam.set_threads(self.global.threads)
104            .expect("unable to set threads for bam reader");
105        self.header = Some(bam::Header::from_template(bam.header()));
106        bam
107    }
108
109    pub fn indexed_bam_reader(&mut self) -> bam::IndexedReader {
110        if &self.bam == "-" {
111            panic!("Cannot use stdin (\"-\") for indexed bam reading. Please provide a file path for the bam file.");
112        }
113
114        let mut bam =
115            bam::IndexedReader::from_path(&self.bam).expect("unable to open indexed bam file");
116        self.header = Some(bam::Header::from_template(bam.header()));
117        bam.set_threads(self.global.threads).unwrap();
118        bam
119    }
120
121    pub fn fibers<'a>(&self, bam: &'a mut bam::Reader) -> FiberseqRecords<'a> {
122        FiberseqRecords::new(bam, self.filters.clone())
123    }
124
125    pub fn header_view(&self) -> bam::HeaderView {
126        bam::HeaderView::from_header(self.header.as_ref().expect(
127            "Input bam must be opened before opening the header or creating a writer with the input bam as a template.",
128        ))
129    }
130
131    pub fn header(&self) -> bam::Header {
132        bam::Header::from_template(&self.header_view())
133    }
134
135    pub fn bam_writer(&self, out: &str) -> bam::Writer {
136        let header = self.header();
137        let program_name = "fibertools-rs";
138        let program_id = "ft";
139        let program_version = crate::VERSION;
140        let mut out = crate::utils::bio_io::program_bam_writer_from_header(
141            out,
142            header,
143            program_name,
144            program_id,
145            program_version,
146        );
147        out.set_threads(self.global.threads)
148            .expect("unable to set threads for bam writer");
149        out
150    }
151}
152
153impl std::default::Default for InputBam {
154    fn default() -> Self {
155        Self {
156            bam: "-".to_string(),
157            filters: FiberFilters::default(),
158            global: cli::GlobalOpts::default(),
159            header: None,
160        }
161    }
162}