fibertools_rs/utils/
input_bam.rs

1use crate::cli;
2use crate::fiber::FiberseqRecords;
3use crate::utils::bio_io;
4use clap::{Args, ValueHint};
5use rust_htslib::bam;
6use rust_htslib::bam::Read;
7use std::fmt::Debug;
8
9pub static MIN_ML_SCORE: &str = "125";
10
11/// This struct establishes the way a Fiber-seq bam should be filtered when it is read in.
12/// This struct is both used as an argument for building FiberSeq records and as a struct that is parsed into cli arguments.
13#[derive(Debug, Args, Clone)]
14pub struct FiberFilters {
15    /// BAM bit flags to filter on, equivalent to `-F` in samtools view
16    #[clap(
17        global = true,
18        short = 'F',
19        long = "filter",
20        default_value = "0",
21        help_heading = "BAM-Options"
22    )]
23    pub bit_flag: u16,
24    /// Filtering expression to use for filtering records
25    /// Example: filter to nucleosomes with lengths greater than 150 bp
26    ///   -x "len(nuc)>150"
27    /// Example: filter to msps with lengths between 30 and 49 bp
28    ///   -x "len(msp)=30:50"
29    /// Example: combine 2+ filter expressions
30    ///   -x "len(nuc)<150,len(msp)=30:50"
31    /// Filtering expressions support len() and qual() functions over msp, nuc, m6a, cpg
32    #[clap(
33        global = true,
34        short = 'x',
35        long = "ftx",
36        alias = "ft-expression",
37        help_heading = "BAM-Options"
38    )]
39    pub filter_expression: Option<String>,
40    /// Minium score in the ML tag to use or include in the output
41    #[clap(long="ml", alias="min-ml-score", default_value = MIN_ML_SCORE, help_heading = "BAM-Options", env="FT_MIN_ML_SCORE")]
42    pub min_ml_score: u8,
43    /// Output uncompressed BAM files
44    #[clap(help_heading = "BAM-Options", short, long)]
45    pub uncompressed: bool,
46    /// strip basemods in the first or last X bp of the read
47    #[clap(
48        global = true,
49        long,
50        default_value = "0",
51        help_heading = "BAM-Options",
52        hide = true
53    )]
54    pub strip_starting_basemods: i64,
55}
56
57impl std::default::Default for FiberFilters {
58    fn default() -> Self {
59        Self {
60            bit_flag: 0,
61            min_ml_score: MIN_ML_SCORE.parse().unwrap(),
62            filter_expression: None,
63            uncompressed: false,
64            strip_starting_basemods: 0,
65        }
66    }
67}
68
69impl FiberFilters {
70    /// This function accepts an iterator over bam records and filters them based on the bit flag.
71    pub fn filter_on_bit_flags<'a, I>(
72        &'a self,
73        records: I,
74    ) -> impl Iterator<Item = bam::Record> + 'a
75    where
76        I: IntoIterator<Item = Result<bam::Record, rust_htslib::errors::Error>> + 'a,
77    {
78        records
79            .into_iter()
80            .map(|r| r.expect("htslib is unable to read a record in the input."))
81            .filter(|r| {
82                // filter by bit flag
83                (r.flags() & self.bit_flag) == 0
84            })
85    }
86}
87
88/// This struct is used to parse the input bam file and the filters that should be applied to the bam file.
89/// This struct is parsed to create command line arguments and then passed to many functions.
90#[derive(Debug, Args)]
91pub struct InputBam {
92    /// Input BAM file. If no path is provided stdin is used. For m6A prediction, this should be a HiFi bam file with kinetics data. For other commands, this should be a bam file with m6A calls.
93    #[clap(default_value = "-", value_hint = ValueHint::AnyPath)]
94    pub bam: String,
95    #[clap(flatten)]
96    pub filters: FiberFilters,
97    #[clap(flatten)]
98    pub global: cli::GlobalOpts,
99    /// by skipping this field it is not parsed as a command line argument
100    #[clap(skip)]
101    pub header: Option<bam::Header>,
102}
103
104impl InputBam {
105    pub fn bam_reader(&mut self) -> bam::Reader {
106        let mut bam = bio_io::bam_reader(&self.bam);
107        bam.set_threads(self.global.threads)
108            .expect("unable to set threads for bam reader");
109        self.header = Some(bam::Header::from_template(bam.header()));
110        bam
111    }
112
113    pub fn indexed_bam_reader(&mut self) -> bam::IndexedReader {
114        if &self.bam == "-" {
115            panic!("Cannot use stdin (\"-\") for indexed bam reading. Please provide a file path for the bam file.");
116        }
117
118        let mut bam =
119            bam::IndexedReader::from_path(&self.bam).expect("unable to open indexed bam file");
120        self.header = Some(bam::Header::from_template(bam.header()));
121        bam.set_threads(self.global.threads).unwrap();
122        bam
123    }
124
125    pub fn fibers<'a>(&self, bam: &'a mut bam::Reader) -> FiberseqRecords<'a> {
126        FiberseqRecords::new(bam, self.filters.clone())
127    }
128
129    pub fn header_view(&self) -> bam::HeaderView {
130        bam::HeaderView::from_header(self.header.as_ref().expect(
131            "Input bam must be opened before opening the header or creating a writer with the input bam as a template.",
132        ))
133    }
134
135    pub fn header(&self) -> bam::Header {
136        bam::Header::from_template(&self.header_view())
137    }
138
139    pub fn bam_writer(&self, out: &str) -> bam::Writer {
140        let header = self.header();
141        let program_name = "fibertools-rs";
142        let program_id = "ft";
143        let program_version = crate::VERSION;
144        let mut out = crate::utils::bio_io::program_bam_writer_from_header(
145            out,
146            header,
147            program_name,
148            program_id,
149            program_version,
150        );
151        out.set_threads(self.global.threads)
152            .expect("unable to set threads for bam writer");
153        if self.filters.uncompressed {
154            out.set_compression_level(bam::CompressionLevel::Uncompressed)
155                .expect("Unable to set compression level to uncompressed");
156        }
157        out
158    }
159
160    /// Add panSN-spec prefix to all contig names in the stored header
161    /// This must be called after opening the BAM reader to have an effect
162    pub fn add_pansn_prefix(&mut self, pansn_prefix: &str) {
163        if let Some(ref mut header) = self.header {
164            *header = crate::utils::panspec::add_pan_spec_header(header, pansn_prefix);
165        }
166    }
167
168    /// Strip panSN-spec information from all contig names in the stored header
169    /// using the provided delimiter character (e.g., '#' for panSN format)
170    /// This must be called after opening the BAM reader to have an effect
171    pub fn strip_pansn_spec(&mut self, delimiter: char) {
172        if let Some(ref mut header) = self.header {
173            *header = crate::utils::panspec::strip_pan_spec_header(header, &delimiter);
174        }
175    }
176}
177
178impl std::default::Default for InputBam {
179    fn default() -> Self {
180        Self {
181            bam: "-".to_string(),
182            filters: FiberFilters::default(),
183            global: cli::GlobalOpts::default(),
184            header: None,
185        }
186    }
187}