// Command-line interface definition for the `jam` minhasher (clap derive API).
use clap::{Parser, Subcommand, ValueEnum};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
// Top-level command-line arguments of the `jam` binary.
//
// Explanatory notes in this block are deliberately plain `//` comments: clap's
// derive macros turn `///` doc comments into help text, so only the original
// `///` lines are kept as doc comments.
#[derive(Parser, Debug)]
#[command(
    name = "jam",
    bin_name = "jam",
    version = "0.1.0-beta.1",
    about = "Just another (genomic) minhasher (jam), obviously blazingly fast",
    long_about = "An optimized minhash implementation that focuses on quick scans for small sequences in large datasets."
)]
pub struct Cli {
    // The chosen subcommand together with its own arguments.
    #[command(subcommand)]
    pub command: Commands,

    /// Number of threads to use
    // Global option (accepted alongside any subcommand). A default of "1" is
    // supplied, so the `Option` is presumably always `Some` after parsing.
    #[arg(global = true, short, long, default_value = "1")]
    pub threads: Option<usize>,

    /// Overwrite output files
    // Global boolean flag; defaults to "false".
    #[arg(global = true, short, long, default_value = "false")]
    pub force: bool,
}
// Output formats selectable on the command line (see `Commands::Sketch::format`).
// Variant notes stay plain `//` comments so the generated help is unchanged.
#[derive(Clone, Debug, ValueEnum)]
pub enum OutputFormats {
    // Binary output; this is the default value of `--format`.
    Bin,

    // Sourmash compatible json
    Sourmash,
}
// Hash algorithms selectable on the command line (see `Commands::Sketch::algorithm`).
// The type is also (de)serializable through serde.
#[derive(Clone, Debug, ValueEnum, Serialize, Deserialize)]
pub enum HashAlgorithms {
    // AHash < 32 | Xxhash >= 32
    // NOTE(review): the threshold presumably refers to the k-mer size - confirm.
    Default,

    Ahash,

    Xxhash,

    Murmur3,
}
// Subcommands of the `jam` binary: `sketch`, `merge` and `dist`.
//
// Every `///` doc comment below is turned into user-facing help text by clap,
// so maintainer notes are written as plain `//` comments.
#[derive(Debug, Subcommand, Clone)]
pub enum Commands {
    /// Sketch one or more files and write the result to an output file (or stdout)
    #[command(arg_required_else_help = true)]
    Sketch {
        /// Input file(s), one directory or one file with list of files to be hashed
        #[arg(value_parser = clap::value_parser!(std::path::PathBuf))]
        input: Vec<PathBuf>,

        /// Output file
        #[arg(short, long, value_parser = clap::value_parser!(std::path::PathBuf))]
        output: Option<PathBuf>,

        /// kmer size, all sketches must have the same size to be compared
        #[arg(short = 'k', long = "kmer-size", default_value = "21")]
        kmer_size: u8,

        /// Scale the hash space to a minimum fraction of the maximum hash value (FracMinHash)
        #[arg(long)]
        fscale: Option<u64>,

        /// Scale the hash space to a minimum fraction of all k-mers (SizeMinHash)
        #[arg(long)]
        kscale: Option<u64>,

        /// Minimum number of k-mers (per record) to be hashed, bottom cut-off
        #[arg(long)]
        nmin: Option<u64>,

        /// Maximum number of k-mers (per record) to be hashed, top cut-off
        #[arg(long)]
        nmax: Option<u64>,

        /// Change to other output formats
        #[arg(long, default_value = "bin")]
        format: OutputFormats,

        /// Change the hashing algorithm
        #[arg(long, default_value = "default")]
        algorithm: HashAlgorithms,

        /// Create a separate sketch for each sequence record
        #[arg(long)]
        singleton: bool,
    },

    /// Merge multiple input sketches into a single sketch
    #[command(arg_required_else_help = true)]
    Merge {
        /// One or more input sketches
        #[arg(value_parser = clap::value_parser!(std::path::PathBuf))]
        inputs: Vec<PathBuf>,

        /// Output file
        #[arg(
            short,
            long,
            required = true,
            value_parser = clap::value_parser!(std::path::PathBuf)
        )]
        output: PathBuf,
    },

    /// Estimate containment of a (small) sketch against a subset of one or more sketches as database.
    /// Requires all sketches to have the same kmer size
    #[command(arg_required_else_help = true)]
    Dist {
        /// Input sketch or raw file
        #[arg(short, long)]
        input: PathBuf,

        /// Database sketch(es)
        #[arg(short, long)]
        database: Vec<PathBuf>,

        /// Output to file instead of stdout
        #[arg(short, long, value_parser = clap::value_parser!(std::path::PathBuf))]
        output: Option<PathBuf>,

        /// Cut-off value for similarity
        #[arg(short, long, default_value = "0.0")]
        cutoff: f64,

        /// Use the Stats params for restricting results
        #[arg(long)]
        stats: bool,

        // The help texts of `gc_lower` and `gc_upper` were previously swapped
        // ("upper bound" on the lower option and vice versa); each now
        // describes the bound its name refers to.
        /// Use GC stats with a lower bound of x% (gc_lower and gc_upper must be set)
        #[arg(long)]
        gc_lower: Option<u8>,

        /// Use GC stats with an upper bound of y% (gc_lower and gc_upper must be set)
        #[arg(long)]
        gc_upper: Option<u8>,
    },
}