// Command-line interface definition for rustybam, built with clap's derive API.
use clap::IntoApp;
use clap::{AppSettings, Parser, Subcommand};
// Top-level command-line arguments: two options that precede the subcommand,
// plus the selected subcommand itself.
//
// NOTE: plain `//` comments are used in this block on purpose. clap's derive
// turns `///` doc comments into help/about text, so adding or editing `///`
// lines here would change what the program prints.
#[derive(Parser, Debug)]
#[clap(
// `author`, `version` and `about` are given without values, so clap fills
// them in from the crate metadata.
author,
version,
about,
// Subcommands inherit the top-level version string.
propagate_version = true,
subcommand_required = true,
// Lets an unambiguous prefix of a subcommand name select that subcommand.
infer_subcommands = true,
// Running with no arguments prints the help text instead of an error.
arg_required_else_help = true,
// Debug-build assertion that every argument carries help text.
help_expected = true
)]
// List arguments and subcommands in declaration order in the help output.
#[clap(global_setting(AppSettings::DeriveDisplayOrder))]
pub struct Cli {
/// Threads for decompression.
#[clap(short, long, default_value_t = 8)]
pub threads: usize,
/// Logging level [-v: Info, -vv: Debug, -vvv: Trace].
// `from_occurrences` stores how many times the flag was given (0, 1, 2, ...).
#[clap(short, long, parse(from_occurrences), help_heading = "DEBUG")]
pub verbose: usize,
// NOTE(review): `subcommand_required = true` is set above, so this is
// presumably always `Some` after a successful parse -- confirm against the
// callers before changing the type.
#[clap(subcommand)]
pub command: Option<Commands>,
}
///
/// This structure contains all the subcommands for rustybam and their help descriptions.
///
/// Because of naming conventions for rust enums the commands names have
/// different capitalization than on the command line.
/// For example, the `Liftover` enum is invoked using `rustybam liftover`
/// and the `TrimPaf` command with `rustybam trim-paf`.
///
// NOTE: every `///` comment on a variant or field below is turned into help
// text by clap's derive, so edits to those comments change the program's
// `--help` output. Use plain `//` comments for maintainer-only notes.
#[derive(Subcommand, Debug)]
pub enum Commands {
    /// Get percent identity stats from a sam/bam/cram or PAF.
    /// Requires =/X operations in the CIGAR string!
    ///
    /// ## output column descriptions:
    /// ### perID_by_matches is calculated as:
    /// `matches / (matches + mismatches)`
    /// ### perID_by_events is calculated as:
    /// `matches / (matches + mismatches + insertion events + deletion events)`
    /// ### perID_by_all is calculated as:
    /// `matches / (matches + mismatches + insertion bases + deletion bases)`
    Stats {
        /// Input sam/bam/cram file.
        #[clap(default_value = "-")]
        bam: String,
        /// Print query coordinates first.
        #[clap(short, long)]
        qbed: bool,
        /// Specify that the input is paf format
        /// (must have cg tag with extended cigar).
        #[clap(short, long)]
        paf: bool,
    },
    /// Count the number of bases in a bed file.
    #[clap(visible_aliases = &["bedlen", "bl", "bedlength"])]
    BedLength {
        /// Input bed file.
        #[clap(default_value = "-")]
        bed: String,
        /// Make the output human readable (Mbp).
        #[clap(short, long)]
        readable: bool,
        /// Count bases for each category in this column <COLUMN>.
        #[clap(short, long)]
        column: Option<u8>,
    },
    /// Filter PAF records in various ways.
    Filter {
        /// PAF file from minimap2 or unimap. Must have the cg tag, and n matches will be zero unless the cigar uses =X.
        #[clap(default_value = "-")]
        paf: String,
        /// Minimum number of aligned bases across all alignments between a target and query in order to keep those records.
        #[clap(short, long, default_value_t = 0)]
        paired_len: u64,
        /// Minimum alignment length.
        #[clap(short, long, default_value_t = 0)]
        aln: u64,
        /// Minimum query length.
        #[clap(short, long, default_value_t = 0)]
        query: u64,
    },
    /// Invert the target and query sequences in a PAF along with the CIGAR string.
    Invert {
        /// PAF file from minimap2 or unimap. Must have the cg tag, and n matches will be zero unless the cigar uses =X.
        #[clap(default_value = "-")]
        paf: String,
    },
    /// Liftover target sequence coordinates onto query sequence using a PAF.
    ///
    /// This is a function for lifting over coordinates from a reference (<BED>) to a query using a PAF file from minimap2 or unimap (note, you can use `paftools.js sam2paf` to convert SAM data to PAF format).
    /// The returned file is a PAF file that is trimmed to the regions in the bed file. Even the cigar in the returned PAF file is trimmed so it can be used downstream! Additionally, a tag with the format `id:Z:<>` is added to the PAF where `<>` is either the 4th column of the input bed file or if not present `chr_start_end`.
    #[clap(visible_aliases=&["lo"], aliases = &["william-t-harvey", "wth"])]
    Liftover {
        /// PAF file from minimap2 or unimap run with -c and --eqx [i.e. the PAF file must have the cg tag and use extended CIGAR opts (=/X)].
        #[clap(default_value = "-")]
        paf: String,
        /// BED file of reference regions to liftover to the query.
        #[clap(short, long)]
        bed: String,
        /// Specifies that the BED file contains query coordinates to be lifted onto the reference (reverses direction of liftover).
        ///
        /// Note that this will make the query in the input `PAF` the target in the output `PAF`.
        #[clap(short, long)]
        qbed: bool,
        /// If multiple records overlap the same region in the <bed> return only the largest liftover. The default is to return all liftovers.
        #[clap(short, long)]
        largest: bool,
    },
    /// Trim PAF records that overlap in query sequence to find an optimal splitting point using dynamic programming.
    ///
    /// Note, this can be combined with `rb invert` to also trim the target sequence.
    ///
    /// The idea is to mimic some of the trimming that happens in PAV to improve breakpoint detection. Starts with the largest overlap and iterates until no query overlaps remain.
    #[clap(visible_aliases = &["trim", "tp"])]
    TrimPaf {
        /// PAF file from minimap2 or unimap. Must have the cg tag, and n matches will be zero unless the cigar uses =X.
        #[clap(default_value = "-")]
        paf: String,
        /// Value added for a matching base.
        #[clap(short, long, default_value_t = 1)]
        match_score: i32,
        /// Value subtracted for a mismatching base.
        #[clap(short, long, default_value_t = 1)]
        diff_score: i32,
        /// Value subtracted for an insertion or deletion.
        #[clap(short, long, default_value_t = 1)]
        indel_score: i32,
        /// Remove contained alignments as well as overlaps.
        #[clap(short, long)]
        remove_contained: bool,
    },
    /// Orient paf records so that most of the bases are in the forward direction.
    ///
    /// Optionally scaffold the queries so that there is one query per target.
    Orient {
        /// PAF file from minimap2 or unimap. Must have the cg tag, and n matches will be zero unless the cigar uses =X.
        #[clap(default_value = "-")]
        paf: String,
        // NOTE(review): the original help text trailed off with "and the name of
        // the new scaffold will be." without stating the name. The dangling
        // clause was dropped; restore it once the naming rule is confirmed.
        /// Generate ~1 query per target that scaffolds together all the records that map to that target sequence.
        ///
        /// The order of the scaffold will be determined by the average target position across all the queries.
        #[clap(short, long)]
        scaffold: bool,
        /// Space to add between records.
        #[clap(short, long, default_value_t = 1_000_000)]
        insert: u64,
    },
    /// Break PAF records with large indels into multiple records (useful for SafFire).
    #[clap(visible_aliases = &["breakpaf", "bp"])]
    BreakPaf {
        /// PAF file from minimap2 or unimap. Must have the cg tag, and n matches will be zero unless the cigar uses =X.
        #[clap(default_value = "-")]
        paf: String,
        /// Maximum indel size to keep in the paf.
        #[clap(short, long, default_value_t = 100)]
        max_size: u32,
    },
    /// Convert a PAF file into a SAM file. Warning, all alignments will be marked as primary!
    #[clap(visible_aliases = &["paftosam", "p2s", "paf2sam"])]
    PafToSam {
        /// PAF file from minimap2 or unimap. Must have a CIGAR tag.
        #[clap(default_value = "-")]
        paf: String,
        /// Optional query fasta file (with index) to populate the query seq field.
        #[clap(short, long)]
        fasta: Option<String>,
    },
    /// Splits fastx from stdin into multiple files.
    ///
    /// Specifically it reads fastx format (fastq, fasta, or mixed) from stdin and divides the records across multiple output files. Output files can be compressed by adding `.gz`, and the input can also be compressed or uncompressed.
    #[clap(visible_aliases = &["fxs", "fasta-split", "fastq-split" ,"fa-split", "fq-split"])]
    FastxSplit {
        /// List of fastx files to write to.
        fastx: Vec<String>,
    },
    /// Mimic bedtools getfasta but allow for bgzip in both bed and fasta inputs.
    #[clap(visible_aliases = &["getfasta", "gf"])]
    GetFasta {
        /// Fasta file to extract sequences from.
        #[clap(short, long, default_value = "-")]
        fasta: String,
        /// BED file of regions to extract.
        #[clap(short, long)]
        bed: String,
        /// Reverse complement the sequence if the strand is "-".
        #[clap(short, long)]
        strand: bool,
        /// Add the name (4th column) to the header of the fasta output.
        #[clap(short, long)]
        name: bool,
    },
    /// Get the frequencies of each bp at each position.
    Nucfreq {
        /// Input sam/bam/cram file.
        #[clap(default_value = "-")]
        bam: String,
        /// Print nucfreq info from the input region e.g. "chr1:1-1000".
        #[clap(short, long)]
        region: Option<String>,
        /// Print nucfreq info from regions in the bed file;
        /// the output is optionally tagged using the 4th column.
        #[clap(short, long)]
        bed: Option<String>,
        /// Smaller output format.
        #[clap(short, long)]
        small: bool,
    },
    /// Report the longest exact repeat length at every position in a fasta.
    Repeat {
        /// Input fasta file.
        #[clap(default_value = "-")]
        fasta: String,
        /// The smallest repeat length to report.
        #[clap(short, long, default_value_t = 21)]
        min: usize,
    },
    /// Extract the intervals in a genome (fasta) that are made up of SUNs.
    Suns {
        /// Input fasta file with the genome.
        #[clap(short, long, default_value = "-")]
        fasta: String,
        /// The size of the required unique kmer.
        #[clap(short, long, default_value_t = 21)]
        kmer_size: usize,
        /// The maximum size SUN interval to report.
        // Defaults to the largest `usize`, i.e. effectively no upper limit.
        #[clap(short, long, default_value_t = usize::MAX)]
        max_size: usize,
        /// Confirm all the SUNs (very slow) only for debugging.
        #[clap(short, long)]
        validate: bool,
    },
}
pub fn make_cli_parse() -> Cli {
Cli::parse()
}
pub fn make_cli_app() -> clap::Command<'static> {
Cli::command()
}