ska/
cli.rs

1//! Command line interface, built using [`crate::clap` with `Derive`](https://docs.rs/clap/latest/clap/_derive/_tutorial/index.html)
2use std::fmt;
3
4use super::QualFilter;
5use clap::{ArgGroup, Parser, Subcommand, ValueEnum};
6use std::path::PathBuf;
7
/// Default split k-mer size
pub const DEFAULT_KMER: usize = 31;
/// Default proportion of reads (`None`, i.e. not set)
pub const DEFAULT_PROPORTION_READS: Option<f64> = None;
/// Default single strand (which is equivalent to !rc)
pub const DEFAULT_STRAND: bool = false;
/// Default minimum frequency filter threshold
pub const DEFAULT_MINFREQ: f64 = 0.9;
/// Default behaviour when min-freq counting ambiguous sites
pub const DEFAULT_AMBIGMISSING: bool = false;
/// Default repeat masking behaviour
pub const DEFAULT_REPEATMASK: bool = false;
/// Default ambiguous masking behaviour
pub const DEFAULT_AMBIGMASK: bool = false;
/// Default gap ignoring behaviour (at constant sites)
pub const DEFAULT_CONSTGAPS: bool = false;
/// Default minimum k-mer count for FASTQ files
pub const DEFAULT_MINCOUNT: u16 = 5;
/// Default minimum base quality (PHRED score) for FASTQ files
pub const DEFAULT_MINQUAL: u8 = 20;
/// Default quality filtering criteria
pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Strict;
/// Default -m for ska lo (maximum fraction of missing data)
pub const DEFAULT_MISSING_SKALO: f32 = 0.1;
/// Default -d for ska lo (maximum depth of recursive paths)
pub const DEFAULT_MAX_PATHDEPTH: usize = 4;
/// Default -n for ska lo (maximum number of internal indel k-mers)
pub const DEFAULT_MAX_INDEL_KMERS: usize = 2;
36
/// Value parser for the k-mer size option.
///
/// Accepts `s` only if it parses as a `usize` that is odd and lies in
/// `5..=63`; otherwise returns an error message describing the problem.
#[doc(hidden)]
fn valid_kmer(s: &str) -> Result<usize, String> {
    match s.parse::<usize>() {
        // In range and odd: the only accepted case.
        Ok(k) if (5..=63).contains(&k) && k % 2 == 1 => Ok(k),
        // Parsed, but even or out of range.
        Ok(_) => Err("K-mer must be an odd number between 5 and 63 (inclusive)".to_string()),
        // Not an unsigned integer at all.
        Err(_) => Err(format!("`{s}` isn't a valid k-mer")),
    }
}
48
/// Value parser for a proportion given on the command line.
///
/// Returns the parsed `f64` when `s` is a number in `0.0..=1.0`.
///
/// # Errors
///
/// Returns a message if `s` is not a floating point number, or if it is
/// outside the inclusive range 0 to 1 (NaN is also rejected, as it is not
/// contained in the range).
#[doc(hidden)]
fn valid_proportion(s: &str) -> Result<f64, String> {
    let p: f64 = s
        .parse()
        .map_err(|_| format!("`{s}` isn't a valid proportion"))?;
    if (0.0..=1.0).contains(&p) {
        Ok(p)
    } else {
        // The message names the proportion being validated (it previously
        // referred to a k-mer, which this function does not handle).
        Err("Proportion must be between 0 and 1 (inclusive)".to_string())
    }
}
60
/// Value parser for frequency options: parses `s` as an `f64` and accepts it
/// only when it lies in the inclusive range 0 to 1.
#[doc(hidden)]
fn zero_to_one(s: &str) -> Result<f64, String> {
    s.parse::<f64>()
        .map_err(|_| format!("`{s}` isn't a valid frequency"))
        .and_then(|freq| {
            if (0.0..=1.0).contains(&freq) {
                Ok(freq)
            } else {
                Err("Frequency must be between 0 and 1 (inclusive)".to_string())
            }
        })
}
72
/// Value parser for the thread count option: `s` must parse as a `usize`
/// of at least one.
#[doc(hidden)]
fn valid_cpus(s: &str) -> Result<usize, String> {
    let Ok(threads) = s.parse::<usize>() else {
        return Err(format!("`{s}` isn't a valid number of cores"));
    };
    // For an unsigned value, "less than one" can only mean zero.
    if threads == 0 {
        return Err("Threads must be one or higher".to_string());
    }
    Ok(threads)
}
84
85/// Prints a warning if more threads than available have been requested
86pub fn check_threads(threads: usize) {
87    let max_threads = num_cpus::get();
88    if threads > max_threads {
89        log::warn!("{threads} threads is greater than available cores {max_threads}");
90    }
91}
92
93#[doc(hidden)]
94pub fn valid_min_kmer(s: &str) -> Result<ValidMinKmer, String> {
95    match s {
96        s if s.eq(&String::from("auto")) => Ok(ValidMinKmer::Auto),
97        s => {
98            // Throw error if it cannot be parsed to u16
99            let x: u16 = s.parse().expect("Invalid minimum kmer count");
100            if x.ge(&1) {
101                log::info!("Using provided minimum kmer count of {x}");
102                Ok(ValidMinKmer::Val(x))
103            } else {
104                Err("Minimum kmer count must be >= 1".to_string())
105            }
106        }
107    }
108}
/// Possible user input for minimum kmer count threshold
///
/// Derives `Debug` so values can be shown in diagnostics and used with
/// `assert_eq!`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum ValidMinKmer {
    /// Attempt to calculate using cov command
    Auto,
    /// User provided u16 value for threshold
    Val(u16),
}
117
/// Possible output file types
// Used as the type of the `format` option of the `Map` subcommand.
// NOTE: with clap's `ValueEnum` derive the variant doc comments below are
// shown as help for the possible values, so they are user-facing text.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum FileType {
    /// Variant call format
    Vcf,
    /// FASTA alignment
    Aln,
}
126
/// Possible variant filters
// Used as the type of the `filter` option of the `Align` and `Weed`
// subcommands.
// NOTE: with clap's `ValueEnum` derive the variant doc comments below are
// shown as help for the possible values, so they are user-facing text.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
pub enum FilterType {
    /// Output all variants
    NoFilter,
    /// Filter constant bases
    NoConst,
    /// Filter any site with an ambiguous base
    NoAmbig,
    /// Filter constant bases, and any ambiguous bases
    NoAmbigOrConst,
}
139
140/// As text, for use in logging messages
141impl fmt::Display for FilterType {
142    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
143        match *self {
144            Self::NoFilter => write!(f, "No filtering"),
145            Self::NoConst => write!(f, "No constant sites"),
146            Self::NoAmbig => write!(f, "No ambiguous sites"),
147            Self::NoAmbigOrConst => write!(f, "No constant sites or ambiguous bases"),
148        }
149    }
150}
151
/// Options that apply to all subcommands
// Top-level parser: holds the selected subcommand plus the flags shared by
// every subcommand. Under clap's derive API the `///` comments on fields are
// used as `--help` text, so explanatory notes here are plain `//` comments.
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(propagate_version = true)]
pub struct Args {
    #[doc(hidden)]
    #[command(subcommand)]
    pub command: Commands,

    /// Show progress messages
    // `global = true`: the flag is also accepted after a subcommand name.
    #[arg(short, long, global = true)]
    pub verbose: bool,
}
165
/// Subcommands and their specific options
// One variant per `ska` subcommand. Under clap's derive API the `///`
// comments on variants and fields are used as help text, so they are
// user-facing strings; maintainer notes in this enum are plain `//` comments.
// Options using `value_parser = ...` are validated by the helper functions
// defined earlier in this file.
#[derive(Subcommand)]
pub enum Commands {
    // The "input" group is required: input must come from `seq_files` or
    // from `file_list`.
    #[command(group(
        ArgGroup::new("input")
            .required(true)
            .args(["seq_files", "file_list"]),
    ))]
    /// Create a split-kmer file from input sequences
    Build {
        /// List of input FASTA files
        #[arg(group = "input")]
        seq_files: Option<Vec<String>>,

        /// File listing input files (tab separated name, sequences)
        #[arg(short, group = "input")]
        file_list: Option<String>,

        /// Output prefix
        #[arg(short)]
        output: String,

        /// K-mer size
        #[arg(short, value_parser = valid_kmer, default_value_t = DEFAULT_KMER)]
        k: usize,

        /// Number of reads before stopping
        // NOTE(review): validated by `valid_proportion` (a value between 0
        // and 1), while the help text above speaks of a number of reads —
        // confirm which is intended.
        #[arg(long, value_parser = valid_proportion)]
        proportion_reads: Option<f64>,

        /// Ignore reverse complement (all contigs are oriented along same strand)
        #[arg(long, default_value_t = DEFAULT_STRAND)]
        single_strand: bool,

        /// Minimum k-mer count (with reads)
        // No default is set here; accepts `auto` or a count (see
        // `valid_min_kmer`).
        #[arg(long, value_parser = valid_min_kmer)]
        min_count: Option<ValidMinKmer>,

        /// Minimum k-mer quality (with reads)
        #[arg(long, default_value_t = DEFAULT_MINQUAL)]
        min_qual: u8,

        /// Quality filtering criteria (with reads)
        #[arg(long, value_enum, default_value_t = DEFAULT_QUALFILTER)]
        qual_filter: QualFilter,

        /// Number of CPU threads
        #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
        threads: usize,
    },
    /// Write an unordered alignment
    Align {
        /// A .skf file, or list of .fasta files
        #[arg(required = true)]
        input: Vec<String>,

        /// Output filename (omit to output to stdout)
        #[arg(short)]
        output: Option<String>,

        /// Minimum fraction of samples a k-mer has to appear in
        #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
        min_freq: f64,

        /// With min_freq, only count non-ambiguous sites
        #[arg(long, default_value_t = DEFAULT_AMBIGMISSING)]
        filter_ambig_as_missing: bool,

        /// Filter for constant middle base sites
        #[arg(long, value_enum, default_value_t = FilterType::NoConst)]
        filter: FilterType,

        /// Mask any ambiguous bases in the alignment with 'N'
        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
        ambig_mask: bool,

        /// Ignore gaps '-' in constant sites (for low coverage samples)
        #[arg(long, default_value_t = DEFAULT_CONSTGAPS)]
        no_gap_only_sites: bool,

        /// Number of CPU threads
        #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
        threads: usize,
    },
    /// Write an ordered alignment using a reference sequence
    Map {
        /// Reference FASTA file to map to
        reference: String,

        /// A .skf file, or list of .fasta files
        input: Vec<String>,

        /// Output filename (omit to output to stdout)
        #[arg(short)]
        output: Option<String>,

        /// Format of output file
        #[arg(short, long, value_enum, default_value_t = FileType::Aln)]
        format: FileType,

        /// Mask any ambiguous bases in the alignment with 'N'
        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
        ambig_mask: bool,

        /// Mask any repeats in the alignment with 'N'
        #[arg(long, default_value_t = DEFAULT_REPEATMASK)]
        repeat_mask: bool,

        /// Number of CPU threads
        #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
        threads: usize,
    },
    /// Calculate SNP distances and k-mer mismatches
    Distance {
        /// Split-kmer (.skf) file to operate on
        skf_file: String,

        /// Output filename (omit to output to stdout)
        #[arg(short)]
        output: Option<String>,

        /// Minimum fraction of samples a k-mer has to appear in
        /// across the entire alignment
        // Unlike `Align` and `Weed`, the default here is 0.0 rather than
        // `DEFAULT_MINFREQ`.
        #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
        min_freq: f64,

        /// Don't filter out ambiguous bases and compute fractional distances
        #[arg(long, default_value_t = false)]
        allow_ambiguous: bool,

        /// Number of CPU threads
        #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
        threads: usize,
    },
    /// Combine multiple split k-mer files
    Merge {
        /// List of input split-kmer (.skf) files
        skf_files: Vec<String>,

        /// Output prefix
        #[arg(short)]
        output: String,
    },
    // The "input" group is required: samples to remove must come from
    // `names` or from `file_list`.
    #[command(group(
        ArgGroup::new("input")
            .required(true)
            .args(["names", "file_list"]),
    ))]
    /// Remove samples from a split k-mer file
    Delete {
        /// Split-kmer (.skf) file to operate on
        // Given as a required option here, not as a positional argument.
        #[arg(short, long, required = true)]
        skf_file: String,

        /// Output name. If not provided, will overwrite the input file
        #[arg(short)]
        output: Option<String>,

        /// File listing sample names to remove
        #[arg(short, group = "input")]
        file_list: Option<String>,

        /// List of sample names to remove
        #[arg(group = "input")]
        names: Option<Vec<String>>,
    },
    /// Remove k-mers from a split k-mer file
    Weed {
        /// Split-kmer (.skf) file to operate on
        skf_file: String,

        /// A FASTA file containing sequences to remove
        weed_file: Option<String>,

        /// Output filename (omit to overwrite input file)
        #[arg(short)]
        output: Option<String>,

        /// Remove k-mers not in the weed_file
        #[arg(long, default_value_t = false)]
        reverse: bool,

        /// Minimum fraction of samples a k-mer has to appear in
        #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
        min_freq: f64,

        /// With min_freq, only count non-ambiguous sites
        #[arg(long, default_value_t = DEFAULT_AMBIGMISSING)]
        filter_ambig_as_missing: bool,

        /// Filter for constant middle base sites
        // Defaults to `NoFilter` here, whereas `Align` defaults to `NoConst`.
        #[arg(long, value_enum, default_value_t = FilterType::NoFilter)]
        filter: FilterType,

        /// Mask any ambiguous bases in the alignment with 'N'
        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
        ambig_mask: bool,

        /// Ignore gaps '-' in constant sites
        #[arg(long, default_value_t = DEFAULT_CONSTGAPS)]
        no_gap_only_sites: bool,
    },
    /// Get the number of k-mers in a split k-mer file, and other information
    Nk {
        /// Split-kmer (.skf) file to operate on
        skf_file: String,

        /// Also write out split-kmers, and middle base matrix
        #[arg(long, default_value_t = false)]
        full_info: bool,
    },
    /// Estimate a coverage cutoff using a k-mer count profile (FASTQ only)
    Cov {
        /// FASTQ file (or .fastq.gz) with forward reads
        fastq_fwd: String,

        /// FASTQ file (or .fastq.gz) with reverse reads
        fastq_rev: String,

        /// K-mer size
        #[arg(short, value_parser = valid_kmer, default_value_t = DEFAULT_KMER)]
        k: usize,

        /// Ignore reverse complement (all reads are oriented along same strand)
        #[arg(long, default_value_t = DEFAULT_STRAND)]
        single_strand: bool,
    },
    /// Finds 'left out' SNPs and INDELs using a graph
    // `help_heading` sets the heading each option is listed under in the
    // help output.
    Lo {
        /// input SKA2 file
        input_skf: String,

        /// prefix of output files
        output: String,

        /// reference genome for SNP positioning
        #[arg(short = 'r', long, help_heading = "input")]
        reference: Option<PathBuf>,

        /// maximum fraction of missing data
        #[arg(short = 'm', long, default_value_t = DEFAULT_MISSING_SKALO, help_heading = "output")]
        missing: f32,

        /// maximum depth of recursive paths
        #[arg(
            short = 'd',
            long,
            default_value_t = DEFAULT_MAX_PATHDEPTH,
            help_heading = "graph traversal"
        )]
        depth: usize,

        /// maximum number of internal indel k-mers
        #[arg(short = 'n', long, default_value_t = DEFAULT_MAX_INDEL_KMERS, help_heading = "other")]
        indel_kmers: usize,

        /// Number of CPU threads
        #[arg(long, value_parser = valid_cpus, default_value_t = 1, help_heading = "other")]
        threads: usize,
    },
}
427
428/// Function to parse command line args into [`Args`] struct
429pub fn cli_args() -> Args {
430    Args::parse()
431}