1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
use super::nucleosomes::*;
use anstyle;
use clap::{Args, Command, CommandFactory, Parser, Subcommand, ValueHint};
use clap_complete::{generate, Generator, Shell};
use std::{fmt::Debug, io};

pub static MIN_ML_SCORE: &str = "125";

pub fn get_styles() -> clap::builder::Styles {
    let cmd_color = anstyle::AnsiColor::Magenta;
    let header_color = anstyle::AnsiColor::BrightGreen;
    let placeholder_color = anstyle::AnsiColor::Cyan;
    let header_style = anstyle::Style::new()
        .bold()
        .underline()
        .fg_color(Some(anstyle::Color::Ansi(header_color)));
    let cmd_style = anstyle::Style::new()
        .bold()
        .fg_color(Some(anstyle::Color::Ansi(cmd_color)));
    let placeholder_style = anstyle::Style::new()
        //.bold()
        .fg_color(Some(anstyle::Color::Ansi(placeholder_color)));
    clap::builder::Styles::styled()
        .header(header_style)
        .literal(cmd_style)
        .usage(header_style)
        .placeholder(placeholder_style)
}

#[derive(Parser)]
#[clap(
    author,
    version,
    about,
    propagate_version = true,
    subcommand_required = true,
    infer_subcommands = true,
    arg_required_else_help = true
)]
#[command(version = super::LONG_VERSION)]
#[command(styles=get_styles())]
pub struct Cli {
    /// Threads
    #[clap(
        global = true,
        short,
        long,
        default_value_t = 8,
        help_heading = "Global-Options"
    )]
    pub threads: usize,
    /// Logging level [-v: Info, -vv: Debug, -vvv: Trace]
    #[clap(
        global = true,
        short,
        long,
        action = clap::ArgAction::Count,
        help_heading = "Debug-Options"
    )]
    pub verbose: u8,
    /// Turn off all logging
    #[clap(global = true, long, help_heading = "Debug-Options")]
    pub quiet: bool,
    /// Subcommands for fibertools-rs
    #[clap(subcommand)]
    pub command: Option<Commands>,
}

///
/// This structure contains all the subcommands for fiberseq-rs and their help descriptions.
///
#[derive(Subcommand)]
pub enum Commands {
    /// Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. Also adds nucleosome (nl, ns) and MTase sensitive patches (al, as).
    #[clap(visible_aliases = &["m6A", "m6a"])]
    PredictM6A(PredictM6AOptions),
    /// Add nucleosomes to a bam file with m6a predictions
    AddNucleosomes(AddNucleosomeOptions),
    /// Add FIREs (Fiber-seq Inferred Regulatory Elements) to a bam file with m6a predictions
    Fire(FireOptions),
    /// Add footprints to a bam file with m6a predictions
    Footprint(FootprintOptions),
    /// Extract fiberseq data into plain text files.
    /// See https://fiberseq.github.io/fibertools-rs/docs/extract.html for a description of the outputs.
    #[clap(visible_aliases = &["ex", "e"])]
    Extract(ExtractOptions),
    /// This command centers fiberseq data around given reference positions. This is useful for making aggregate m6A and CpG observations, as well as visualization of SVs.
    ///  See https://fiberseq.github.io/fibertools-rs/docs/center.html for a description of the output.
    #[clap(visible_aliases = &["c", "ct"])]
    Center(CenterOptions),
    /// Make decorated bed files for fiberseq data
    TrackDecorators(DecoratorOptions),
    /// Remove HiFi kinetics tags from the input bam file
    ClearKinetics {
        /// Bam HiFi file with kinetics
        #[clap(default_value = "-")]
        bam: String,
        /// Output bam file without hifi kinetics
        #[clap(default_value = "-")]
        out: String,
    },
    /// Strip out select base modifications
    StripBasemods {
        /// Bam HiFi file with base mods
        #[clap(default_value = "-")]
        bam: String,
        /// Output bam file
        #[clap(default_value = "-")]
        out: String,
        #[clap(short, long, default_value = "m6A",  value_parser(["m6A","6mA", "5mC","CpG"]))]
        /// base modification to strip out of the bam file
        basemod: String,
    },
    /// Make command line completions
    #[clap(hide = true)]
    Completions {
        /// If provided, outputs the completion file for given shell
        #[arg(value_enum)]
        shell: Shell,
    },
    /// Make a man page for fibertools-rs
    ///
    /// Writes file for `man` to stdout.
    #[clap(hide = true)]
    Man {},
}

pub fn print_completions<G: Generator>(gen: G, cmd: &mut Command) {
    generate(gen, cmd, cmd.get_name().to_string(), &mut io::stdout());
}

pub fn make_cli_parse() -> Cli {
    Cli::parse()
}

pub fn make_cli_app() -> Command {
    Cli::command()
}

#[derive(Args, Debug, PartialEq, Eq)]
pub struct PredictM6AOptions {
    /// Bam HiFi file with kinetics
    #[clap(default_value = "-")]
    pub bam: String,
    /// Output bam file with m6A calls in new/extended MM and ML bam tags
    #[clap(default_value = "-")]
    pub out: String,
    /// Minium nucleosome length
    #[clap(short, long, default_value = NUC_LEN)]
    pub nucleosome_length: i64,
    /// Minium nucleosome length when combining over a single m6A
    #[clap(short, long, default_value = COMBO_NUC_LEN)]
    pub combined_nucleosome_length: i64,
    /// Minium distance needed to add to an already existing nuc by crossing an m6a
    #[clap(long, default_value = MIN_DIST_ADDED)]
    pub min_distance_added: i64,
    /// Minimum distance from the end of a fiber to call a nucleosome or MSP
    #[clap(short, long, default_value = DIST_FROM_END)]
    pub distance_from_end: i64,
    /// Most m6A events we can skip over to get to the nucleosome length when using D-segment algorithm. 2 is often a good value, negative values disable D-segment for the simple caller.
    #[clap(long, default_value = ALLOWED_SKIPS, hide = true)]
    pub allowed_m6a_skips: i64,
    /// Keep hifi kinetics data
    #[clap(short, long)]
    pub keep: bool,
    /// Set a minimum ML score to keep on instead of using the model specific minimum ML score.
    #[clap(short, long, help_heading = "Developer-Options")]
    pub min_ml_score: Option<u8>,
    /// Keep all m6A calls regardless of how low the ML value is
    #[clap(short, long, help_heading = "Developer-Options")]
    pub all_calls: bool,
    /// Number of reads to include in batch prediction
    ///
    /// Increasing improves GPU performance at the cost of memory.
    #[clap(short, long, default_value = "1", help_heading = "Developer-Options")]
    pub batch_size: usize,
}

impl std::default::Default for PredictM6AOptions {
    fn default() -> Self {
        Self {
            bam: "-".to_string(),
            out: "-".to_string(),
            nucleosome_length: NUC_LEN.parse().unwrap(),
            combined_nucleosome_length: COMBO_NUC_LEN.parse().unwrap(),
            min_distance_added: MIN_DIST_ADDED.parse().unwrap(),
            distance_from_end: DIST_FROM_END.parse().unwrap(),
            allowed_m6a_skips: ALLOWED_SKIPS.parse().unwrap(),
            keep: false,
            min_ml_score: None,
            all_calls: false,
            batch_size: 1,
        }
    }
}

#[derive(Args, Debug, PartialEq, Eq)]
pub struct FireOptions {
    /// Bam HiFi file with m6A and MSP calls
    #[clap(default_value = "-")]
    pub bam: String,
    /// Output file (bam by default, table if --feats_to_text is used, and bed9 + if --extract is used)
    #[clap(default_value = "-")]
    pub out: String,
    /// Output just FIRE elements in bed9 format
    #[clap(short, long)]
    pub extract: bool,
    /// Don't write reads with no m6A calls to the output bam
    #[clap(short, long)]
    pub skip_no_m6a: bool,
    /// Skip reads without at least `N` MSP calls
    #[clap(long, default_value = "0")]
    pub min_msp: usize,
    /// Skip reads without an average MSP size greater than `N`
    #[clap(long, default_value = "0")]
    pub min_ave_msp_size: i64,
    /// Width of bin for feature collection
    #[clap(short, long, default_value = "40")]
    pub width_bin: i64,
    /// Number of bins to collect
    #[clap(short, long, default_value = "9")]
    pub bin_num: i64,
    /// Calculate stats for the highest X bp window within each MSP
    /// Should be a fair amount higher than the expected linker length.
    #[clap(long, default_value = "100")]
    pub best_window_size: i64,
    /// Use 5mC data in FIREs
    #[clap(short, long)]
    pub use_5mc: bool,
    /// Minium length of msp to call a FIRE
    #[clap(short, long, default_value = "85")]
    pub min_msp_length_for_positive_fire_call: i64,
    /// optional path to a model json file
    #[clap(long)]
    pub model: Option<String>,
    /// Optional path to a FDR table
    #[clap(long)]
    pub fdr_table: Option<String>,
    /// Output FIREs features for training in a table format
    #[clap(short, long)]
    pub feats_to_text: bool,
}

#[derive(Args, Debug)]
pub struct AddNucleosomeOptions {
    /// Bam HiFi file with m6A calls
    #[clap(default_value = "-")]
    pub bam: String,
    /// Output bam file with nucleosome calls
    #[clap(default_value = "-")]
    pub out: String,
    /// Minium nucleosome length
    #[clap(short, long, default_value = NUC_LEN)]
    pub nucleosome_length: i64,
    /// Minium nucleosome length when combining over a single m6A
    #[clap(short, long, default_value = COMBO_NUC_LEN)]
    pub combined_nucleosome_length: i64,
    /// Minium distance needed to add to an already existing nuc by crossing an m6a
    #[clap(short, long, default_value = MIN_DIST_ADDED)]
    pub min_distance_added: i64,
    /// Minimum distance from the end of a fiber to call a nucleosome or MSP
    #[clap(short, long, default_value = DIST_FROM_END)]
    pub distance_from_end: i64,
    /// Most m6A events we can skip over to get to the nucleosome length when using D-segment algorithm. 2 is often a good value, negative values disable D-segment for the simple caller.
    #[clap(short, long, default_value = ALLOWED_SKIPS, hide = true)]
    pub allowed_m6a_skips: i64,
    /// Minium score in the ML tag to use in predicting nucleosomes
    #[clap(long, default_value = MIN_ML_SCORE)]
    pub min_ml_score: u8,
}
impl AddNucleosomeOptions {
    pub fn default_value() -> Self {
        Self {
            bam: "-".to_string(),
            out: "-".to_string(),
            nucleosome_length: NUC_LEN.parse().unwrap(),
            combined_nucleosome_length: COMBO_NUC_LEN.parse().unwrap(),
            min_distance_added: MIN_DIST_ADDED.parse().unwrap(),
            distance_from_end: DIST_FROM_END.parse().unwrap(),
            allowed_m6a_skips: ALLOWED_SKIPS.parse().unwrap(),
            min_ml_score: MIN_ML_SCORE.parse().unwrap(),
        }
    }
}

#[derive(Args)]
pub struct DecoratorOptions {
    /// Bam HiFi file with m6A calls
    #[clap(default_value = "-")]
    pub bam: String,
    /// Output path for bed12 file to be decorated
    #[clap(short, long)]
    pub bed12: String,
    /// Output path for decorator bed file
    #[clap(short, long, default_value = "-")]
    pub decorator: String,
    /// Minium score in the ML tag to include in the output
    #[clap(short, long, default_value = MIN_ML_SCORE)]
    pub min_ml_score: u8,
}

#[derive(Args, Debug, PartialEq, Eq)]
pub struct FootprintOptions {
    /// Indexed and aligned bam file with m6A and MSP calls
    pub bam: String,
    /// BED file with the regions to footprint. Should all contain the same motif with proper strand information, and ideally be ChIP-seq peaks.
    pub bed: String,
    /// yaml describing the modules of the footprint
    pub yaml: String,
    /// Output bam
    #[clap(short, long, default_value = "-")]
    pub out: String,
}

#[derive(Args, Debug, PartialEq, Eq)]
pub struct CenterOptions {
    /// Aligned Fiber-seq bam file.
    pub bam: String,
    /// Bed file on which to center fiberseq reads. Data is adjusted to the start position of the bed file and corrected for strand if the strand is indicated in the 6th column of the bed file. The 4th column will also be checked for the strand but only after the 6th is.        
    /// If you include strand information in the 4th (or 6th) column it will orient data accordingly and use the end position of bed record instead of the start if on the minus strand. This means that profiles of motifs in both the forward and minus orientation will align to the same central position.
    pub bed: String,
    /// Minium score in the ML tag to include in the output
    #[clap(short, long, default_value = MIN_ML_SCORE)]
    pub min_ml_score: u8,
    /// Set a maximum distance from the start of the motif to keep a feature
    #[clap(short, long)]
    pub dist: Option<i64>,
    /// Provide data in wide format, one row per read
    #[clap(short, long)]
    pub wide: bool,
    /// Return relative reference position instead of relative molecular position
    #[clap(short, long)]
    pub reference: bool,
    /// Replace the sequence output column with just "N".
    #[clap(short, long)]
    pub simplify: bool,
}

#[derive(Args, Debug, PartialEq, Eq)]
pub struct ExtractOptions {
    /// Input fiberseq bam file. If no path is provided extract will read bam data from stdin.
    #[arg(default_value = "-", value_hint = ValueHint::AnyPath)]
    pub bam: String,
    /// Report in reference sequence coordinates
    #[clap(short, long, default_value = "true",
          default_value_ifs([
              ("molecular", "true", "false"),
              ("molecular", "false", "true"),
          ]))
      ]
    pub reference: bool,
    /// Report positions in the molecular sequence coordinates
    #[clap(long, default_value = "false")]
    pub molecular: bool,
    /// Minium score in the ML tag to include in the output
    #[clap(short, long, default_value = MIN_ML_SCORE)]
    pub min_ml_score: u8,
    /// Output path for m6a bed12
    #[clap(long)]
    pub m6a: Option<String>,
    /// Output path for 5mC (CpG, primrose) bed12
    #[clap(short, long)]
    pub cpg: Option<String>,
    /// Output path for methylation sensitive patch (msp) bed12
    #[clap(long)]
    pub msp: Option<String>,
    /// Output path for nucleosome bed12
    #[clap(short, long)]
    pub nuc: Option<String>,
    /// Output path for a tabular format including "all" fiberseq information in the bam
    #[clap(short, long)]
    pub all: Option<String>,
    /// Include per base quality scores in "fiber_qual"
    #[clap(short, long, help_heading = "All-Format-Options")]
    pub quality: bool,
    /// Simplify output by remove fiber sequence
    #[clap(short, long, help_heading = "All-Format-Options")]
    pub simplify: bool,
}