fibertools_rs/utils/
input_bam.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
use crate::cli;
use crate::fiber::FiberseqRecords;
use crate::utils::bio_io;
use clap::{Args, ValueHint};
use rust_htslib::bam;
use rust_htslib::bam::Read;
use std::fmt::Debug;

pub static MIN_ML_SCORE: &str = "125";

/// This struct establishes the way a Fiber-seq bam should be filtered when it is read in.
/// This struct is both used as an argument for building FiberSeq records and as a struct that is parsed into cli arguments.
#[derive(Debug, Args, Clone)]
pub struct FiberFilters {
    /// BAM bit flags to filter on, equivalent to `-F` in samtools view
    #[clap(
        global = true,
        short = 'F',
        long = "filter",
        default_value = "0",
        help_heading = "BAM-Options"
    )]
    pub bit_flag: u16,
    /// Filtering expression to use for filtering records
    /// Example: filter to nucleosomes with lengths greater than 150 bp
    ///   -x "len(nuc)>150"
    /// Example: filter to msps with lengths between 30 and 49 bp
    ///   -x "len(msp)=30:50"
    /// Example: combine 2+ filter expressions
    ///   -x "len(nuc)<150,len(msp)=30:50"
    /// Filtering expressions support len() and qual() functions over msp, nuc, m6a, cpg
    #[clap(
        global = true,
        short = 'x',
        long = "ftx",
        alias = "ft-expression",
        help_heading = "BAM-Options"
    )]
    pub filter_expression: Option<String>,
    /// Minium score in the ML tag to use or include in the output
    #[clap(long="ml", alias="min-ml-score", default_value = MIN_ML_SCORE, help_heading = "BAM-Options", env="FT_MIN_ML_SCORE")]
    pub min_ml_score: u8,
    /// strip basemods in the first or last X bp of the read
    #[clap(
        global = true,
        long,
        default_value = "0",
        help_heading = "BAM-Options",
        hide = true
    )]
    pub strip_starting_basemods: i64,
}

impl std::default::Default for FiberFilters {
    fn default() -> Self {
        Self {
            bit_flag: 0,
            min_ml_score: MIN_ML_SCORE.parse().unwrap(),
            filter_expression: None,
            strip_starting_basemods: 0,
        }
    }
}

impl FiberFilters {
    /// This function accepts an iterator over bam records and filters them based on the bit flag.
    pub fn filter_on_bit_flags<'a, I>(
        &'a self,
        records: I,
    ) -> impl Iterator<Item = bam::Record> + 'a
    where
        I: IntoIterator<Item = Result<bam::Record, rust_htslib::errors::Error>> + 'a,
    {
        records
            .into_iter()
            .map(|r| r.expect("htslib is unable to read a record in the input."))
            .filter(|r| {
                // filter by bit flag
                (r.flags() & self.bit_flag) == 0
            })
    }
}

/// This struct is used to parse the input bam file and the filters that should be applied to the bam file.
/// This struct is parsed to create command line arguments and then passed to many functions.
#[derive(Debug, Args)]
pub struct InputBam {
    /// Input BAM file. If no path is provided stdin is used. For m6A prediction, this should be a HiFi bam file with kinetics data. For other commands, this should be a bam file with m6A calls.
    #[clap(default_value = "-", value_hint = ValueHint::AnyPath)]
    pub bam: String,
    #[clap(flatten)]
    pub filters: FiberFilters,
    #[clap(flatten)]
    pub global: cli::GlobalOpts,
    /// by skipping this field it is not parsed as a command line argument
    #[clap(skip)]
    pub header: Option<bam::Header>,
}

impl InputBam {
    pub fn bam_reader(&mut self) -> bam::Reader {
        let mut bam = bio_io::bam_reader(&self.bam);
        bam.set_threads(self.global.threads)
            .expect("unable to set threads for bam reader");
        self.header = Some(bam::Header::from_template(bam.header()));
        bam
    }

    pub fn indexed_bam_reader(&mut self) -> bam::IndexedReader {
        if &self.bam == "-" {
            panic!("Cannot use stdin (\"-\") for indexed bam reading. Please provide a file path for the bam file.");
        }

        let mut bam =
            bam::IndexedReader::from_path(&self.bam).expect("unable to open indexed bam file");
        self.header = Some(bam::Header::from_template(bam.header()));
        bam.set_threads(self.global.threads).unwrap();
        bam
    }

    pub fn fibers<'a>(&self, bam: &'a mut bam::Reader) -> FiberseqRecords<'a> {
        FiberseqRecords::new(bam, self.filters.clone())
    }

    pub fn header_view(&self) -> bam::HeaderView {
        bam::HeaderView::from_header(self.header.as_ref().expect(
            "Input bam must be opened before opening the header or creating a writer with the input bam as a template.",
        ))
    }

    pub fn header(&self) -> bam::Header {
        bam::Header::from_template(&self.header_view())
    }

    pub fn bam_writer(&self, out: &str) -> bam::Writer {
        let header = self.header();
        let program_name = "fibertools-rs";
        let program_id = "ft";
        let program_version = crate::VERSION;
        let mut out = crate::utils::bio_io::program_bam_writer_from_header(
            out,
            header,
            program_name,
            program_id,
            program_version,
        );
        out.set_threads(self.global.threads)
            .expect("unable to set threads for bam writer");
        out
    }
}

impl std::default::Default for InputBam {
    fn default() -> Self {
        Self {
            bam: "-".to_string(),
            filters: FiberFilters::default(),
            global: cli::GlobalOpts::default(),
            header: None,
        }
    }
}