nail 0.5.0

nail is an alignment inference tool
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
use std::{io::Write, path::PathBuf, str::FromStr};

use anyhow::bail;
use clap::{Args, Parser, Subcommand};

use crate::util::{term::*, PathExt};

#[derive(Subcommand)]
#[allow(clippy::large_enum_variant)]
pub enum NailSubCommands {
    #[command(about = "Run nail's protein search pipeline")]
    Search(SearchArgs),
    #[command(subcommand, hide = true)]
    Dev(DevSubCommands),
}

#[derive(Subcommand)]
#[allow(clippy::large_enum_variant)]
pub enum DevSubCommands {
    #[command()]
    Play(SearchArgs),
    Search(SearchArgs),
    Mx(SearchArgs),
}

#[derive(Parser)]
#[command(version)]
#[command(name = "nail")]
#[command(
    about = "Using MMseqs2 to find rough alignment seeds, perform bounded profile HMM sequence alignment"
)]
pub struct NailCli {
    #[command(subcommand)]
    pub command: NailSubCommands,
}

#[derive(Debug, Args)]
pub struct SearchArgs {
    /// The query database file
    #[arg(value_name = "QUERY.[fasta:hmm]")]
    pub query_path: PathBuf,

    /// The target database file
    #[arg(value_name = "TARGET.fasta")]
    pub target_path: PathBuf,

    /// The number of threads that nail will use
    #[arg(short = 't', default_value_t = 8usize, value_name = "N")]
    pub num_threads: usize,

    /// Print out pipeline summary statistics
    #[arg(short = 's', action)]
    pub print_summary_stats: bool,

    /// Don't write any tabular results, write alignments to stdout
    #[arg(short = 'x', action)]
    pub ali_to_stdout: bool,

    #[command(flatten)]
    #[clap(next_help_heading = "File I/O options")]
    pub io_args: IoArgs,

    #[command(flatten)]
    #[clap(next_help_heading = "Pipeline options")]
    pub pipeline_args: PipelineArgs,

    /// Arguments that are passed to MMseqs2
    #[command(flatten)]
    #[clap(next_help_heading = "Seeding options")]
    pub mmseqs_args: MmseqsArgs,

    #[command(flatten)]
    #[clap(next_help_heading = "Expert options")]
    pub expert_args: ExpertArgs,

    #[command(flatten)]
    #[clap(next_help_heading = "Dev options")]
    pub dev_args: DevArgs,
}

impl SearchArgs {
    pub fn validate(&mut self) -> anyhow::Result<()> {
        if let Some(p1) = self.dev_args.bit_p {
            let p2 = *libnail::output::output_tabular::BIT_P.get_or_init(|| p1);
            if p1 != p2 {
                bail!("failed to set BIT_P")
            }
        }

        if let Some(p1) = self.dev_args.f32_p {
            let p2 = *libnail::output::output_tabular::F32_P.get_or_init(|| p1);
            if p1 != p2 {
                bail!("failed to set F32_P")
            }
        }

        if let Some(p1) = self.dev_args.f64_p {
            let p2 = *libnail::output::output_tabular::F64_P.get_or_init(|| p1);
            if p1 != p2 {
                bail!("failed to set F64_P")
            }
        }

        {
            // quickly make sure we can write to all of the results paths
            self.io_args.temp_dir_path.create_dir()?;

            if let Some(path) = &self.io_args.tbl_results_path {
                path.check_open(self.io_args.allow_overwrite)?;
            }

            if let Some(path) = &self.io_args.ali_results_path {
                path.check_open(self.io_args.allow_overwrite)?;
            }

            if let Some(path) = &self.io_args.seeds_output_path {
                path.check_open(self.io_args.allow_overwrite)?;
            }

            if let Some(path) = &self.dev_args.stats_results_path {
                path.check_open(self.io_args.allow_overwrite)?;
            }
        }

        if self.ali_to_stdout {
            self.io_args.tbl_results_path = None
        }

        if self.pipeline_args.only_seed && self.io_args.seeds_output_path.is_none() {
            self.io_args.seeds_output_path = Some(PathBuf::from_str("./seeds.tsv")?);
        }

        if self.mmseqs_args.prog_seed {
            if self.mmseqs_args.max_seqs != 2_147_483_647 {
                bail!(
                    "the argument '{YELLOW}--mmseqs-max-seqs{RESET}' is set wrong: {}",
                    self.mmseqs_args.max_seqs
                )
            }

            if self.mmseqs_args.prog_n.is_none() {
                bail!("the argument '{YELLOW}--prog-n{RESET}' is unset")
            }

            if self.mmseqs_args.prog_n.is_none() {
                bail!("the argument '{YELLOW}--prog-f{RESET}' is unset")
            }
        } else {
            #[allow(clippy::collapsible_if)]
            if self.mmseqs_args.prog_n.is_some() {
                bail!("the argument '{YELLOW}--prog-n{RESET}' cannot be used without '{YELLOW}--prog-seed{RESET}'")
            }
            if self.mmseqs_args.prog_n.is_some() {
                bail!("the argument '{YELLOW}--prog-f{RESET}' cannot be used without '{YELLOW}--prog-seed{RESET}'")
            }
        }

        Ok(())
    }

    pub fn write(&self, out: &mut impl Write) -> anyhow::Result<()> {
        writeln!(
            out,
            "target: {}",
            self.target_path.to_str().unwrap_or_default()
        )?;
        writeln!(
            out,
            "query:  {}",
            self.query_path.to_str().unwrap_or_default()
        )?;
        writeln!(out)?;

        writeln!(out, "pipeline arguments:")?;
        writeln!(out, " ├─ mmseqs -k: {}", self.mmseqs_args.k)?;
        writeln!(out, " ├─ mmseqs -s: {:.}", self.mmseqs_args.s)?;
        writeln!(
            out,
            " ├─ mmseqs --max-seqs: {:.}",
            self.mmseqs_args.max_seqs
        )?;
        writeln!(out, " ├─ prog-seed: {}", self.mmseqs_args.prog_seed)?;
        if self.mmseqs_args.prog_seed {
            writeln!(
                out,
                "   ├─ n: {}",
                self.mmseqs_args.prog_n.unwrap_or_default()
            )?;
            writeln!(
                out,
                "   └─ f: {}",
                self.mmseqs_args.prog_f.unwrap_or_default()
            )?;
        }
        writeln!(out, " ├─ α: {}", self.pipeline_args.alpha)?;
        writeln!(out, " ├─ β: {}", self.pipeline_args.beta)?;
        writeln!(out, " ├─ γ: {}", self.pipeline_args.gamma)?;
        writeln!(out, " ├─ S: {}", self.pipeline_args.seed_pvalue_threshold)?;
        writeln!(out, " ├─ C: {}", self.pipeline_args.cloud_pvalue_threshold)?;
        writeln!(
            out,
            " ├─ F: {}",
            self.pipeline_args.forward_pvalue_threshold
        )?;
        writeln!(out, " ├─ E: {}", self.pipeline_args.e_value_threshold)?;
        writeln!(
            out,
            " └─ Z: {}",
            self.expert_args.target_database_size.unwrap_or_default()
        )?;

        Ok(())
    }
}

#[derive(Args, Debug, Clone, Default)]
pub struct IoArgs {
    /// The file where tabular output will be written
    #[arg(long = "tbl-out", default_value = "results.tbl", value_name = "PATH")]
    pub tbl_results_path: Option<PathBuf>,

    /// The file where alignment output will be written
    #[arg(long = "ali-out", default_value = None, value_name = "PATH")]
    pub ali_results_path: Option<PathBuf>,

    /// A file containing pre-computed alignment seeds
    #[arg(long = "seeds", value_name = "PATH")]
    pub seeds_input_path: Option<PathBuf>,

    /// The file where alignment seeds will be written
    #[arg(long = "seeds-out", default_value = None, value_name = "PATH")]
    pub seeds_output_path: Option<PathBuf>,

    /// The directory where intermediate files will be placed
    #[arg(long = "tmp-dir", default_value = "tmp-nail/", value_name = "PATH")]
    pub temp_dir_path: PathBuf,

    /// Allow nail to overwrite files
    #[arg(long = "allow-overwrite", default_value_t = false)]
    pub allow_overwrite: bool,
}

#[derive(Args, Debug, Clone, Default)]
pub struct PipelineArgs {
    /// Pruning parameter alpha
    #[arg(
        short = 'A',
        default_value_t = 10.0,
        value_name = "X",
        help = "Cloud search parameter α:\n  \
                local score pruning threshold"
    )]
    pub alpha: f32,

    /// Pruning parameter beta
    #[arg(
        short = 'B',
        default_value_t = 16.0,
        value_name = "X",
        help = "Cloud search parameter β:\n  \
                global score pruning threshold"
    )]
    pub beta: f32,

    /// Pruning parameter gamma
    #[arg(
        short = 'G',
        default_value_t = 5,
        value_name = "N",
        help = "Cloud search parameter γ:\n  \
                at minimum, compute N anti-diagonals"
    )]
    pub gamma: usize,

    /// Seeding filter threshold
    #[arg(
        short = 'S',
        default_value_t = 1e-4,
        value_name = "X",
        help = "Seeding filter threshold:\n  \
                filter hits with P-value > X"
    )]
    pub seed_pvalue_threshold: f64,

    /// Cloud search filter threshold
    #[arg(
        short = 'C',
        default_value_t = 1e-2,
        value_name = "X",
        help = "Cloud search threshold:\n  \
                filter hits with P-value > X"
    )]
    pub cloud_pvalue_threshold: f64,

    /// Forward filter threshold
    #[arg(
        short = 'F',
        default_value_t = 1e-4,
        value_name = "X",
        help = "Forward filter threshold:\n  \
                filter hits with P-value > X"
    )]
    pub forward_pvalue_threshold: f64,

    /// Final E-value threshold
    #[arg(
        short = 'E',
        default_value_t = 10.0,
        value_name = "X",
        help = "Final reporting threshold:\n  \
                filter hits with E-value > X"
    )]
    pub e_value_threshold: f64,

    /// Produce alignment seeds and terminate
    #[arg(long = "only-seed", action)]
    pub only_seed: bool,
}

#[derive(Args, Debug, Clone, Default)]
pub struct ExpertArgs {
    /// Override the number of comparisons used for E-value calculation
    #[arg(short = 'Z', value_name = "N")]
    pub target_database_size: Option<usize>,

    /// Don't compute sequence composition bias score correction
    #[arg(long = "no-null2", action)]
    pub no_null_two: bool,
}

#[derive(Args, Debug, Clone, Default)]
pub struct DevArgs {
    /// Where to place stats output
    #[arg(long, value_name = "PATH", hide = true)]
    pub stats_results_path: Option<PathBuf>,

    /// Compute the full DP matrices
    #[arg(long, action, hide = true)]
    pub full_dp: bool,

    /// Formatting precision for Bits
    #[arg(long, hide = true)]
    pub bit_p: Option<usize>,

    /// Formatting precision for f32
    #[arg(long, hide = true)]
    pub f32_p: Option<usize>,

    /// Formatting precision for f64
    #[arg(long, hide = true)]
    pub f64_p: Option<usize>,
}

#[derive(Args, Debug, Clone, Default)]
pub struct MmseqsArgs {
    /// MMseqs2 Parameter: k-mer length (0: automatically set to optimum)
    #[arg(
        long = "mmseqs-k",
        default_value_t = 6usize,
        value_name = "N",
        help = "MMseqs2 parameter:\n  \
                k-mer length (0: automatically set to optimum)"
    )]
    pub k: usize,

    /// MMseqs2 Parameter: Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive
    #[arg(
        long = "mmseqs-s",
        default_value_t = 10.0,
        value_name = "X",
        help = "MMseqs2 parameter:\n  \
                Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive"
    )]
    pub s: f32,

    /// MMseqs2 Parameter: Maximum results per query sequence allowed to pass the prefilter
    #[arg(
        long = "mmseqs-max-seqs",
        default_value_t = 200usize,
        default_value_if("prog_seed", "true", "2147483647"),
        value_name = "N",
        help = "MMseqs2 parameter:\n  \
                Maximum results per query sequence allowed to pass the prefilter"
    )]
    pub max_seqs: usize,

    /// MMseqs2 Parameter: Correct for locally biased amino acid composition (range 0-1)
    #[arg(
        long = "mmseqs-comp-bias-corr",
        default_value = None,
        value_name = "N",
        hide = true,
        help = "MMseqs2 parameter:\n  \
                Correct for locally biased amino acid composition (range 0-1)"
    )]
    pub comp_bias_corr: Option<usize>,

    /// Enable progressive seeding
    #[arg(long, action, conflicts_with = "max_seqs")]
    pub prog_seed: bool,

    /// The initial number of mmseqs alignments per query
    #[arg(
        long,
        default_value = "200",
        default_value_if("prog_seed", "true", "200"),
        default_value_if("prog_seed", "false", None),
        value_name = "N",
        conflicts_with = "max_seqs",
        help = "Progressive seeding:\n  \
                the initial number of mmseqs alignments per query"
    )]
    pub prog_n: Option<usize>,

    /// test
    #[arg(
        long,
        default_value = "0.01",
        default_value_if("prog_seed", "true", "0.01"),
        default_value_if("prog_seed", "false", None),
        value_name = "X",
        conflicts_with = "max_seqs",
        help = "Progressive seeding:\n  \
                the fraction of hits required to continue progressive seeding"
    )]
    pub prog_f: Option<f32>,
}