1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
use clap::{Parser, Subcommand};
use std::path::PathBuf;
// Top-level command-line interface of the `jam` binary.
//
// Maintainer notes in this block are plain `//` comments on purpose: clap's
// derive turns `///` doc comments into user-visible help text, so only the
// intended help strings are written as `///`.
#[derive(Debug, Parser)]
#[command(
    name = "jam",
    bin_name = "jam",
    version = "0.9.10",
    about = "Just another (genomic) minhasher (jam), obviously blazingly fast",
    long_about = "An optimized minhash implementation that focuses on quick scans for small sequences in large datasets."
)]
pub struct Cli {
    // The subcommand chosen on the command line.
    #[command(subcommand)]
    pub command: Commands,

    /// Number of threads to use
    // Declared with a default of "1", so clap supplies a value even when the
    // flag is absent; the `Option` wrapper is kept for caller compatibility.
    #[arg(short, long, global = true, default_value = "1")]
    pub threads: Option<usize>,

    /// Overwrite output files
    // A `bool` field is a set-true flag that is `false` when absent, so no
    // explicit default is needed.
    #[arg(short, long, global = true)]
    pub force: bool,

    /// Silent mode, no (additional) output to stdout.
    /// Only errors and output files will be printed
    #[arg(short, long, global = true)]
    pub silent: bool,

    /// Maximum memory usage in GB
    // Declared with a default of "2"; the `Option` wrapper is kept for caller
    // compatibility.
    #[arg(short, long, global = true, default_value = "2")]
    pub memory: Option<usize>,
}
// Top-level subcommands of the CLI.
//
// Maintainer notes in this block are plain `//` comments on purpose: clap's
// derive turns `///` doc comments into user-visible help text, so the `///`
// lines below are the help strings and are kept verbatim.
#[derive(Clone, Debug, Subcommand)]
pub enum Commands {
    /// Sketch one or more files and write the result to an output file
    #[command(arg_required_else_help = true)]
    Sketch {
        /// Input file(s), directories, or file with list of files to be hashed
        // Positional argument; `PathBuf` fields get the path value parser by
        // default, so none is spelled out.
        input: Vec<PathBuf>,

        /// Output file (.jam format)
        #[arg(long, short)]
        output: PathBuf,

        /// K-mer size, all sketches must have the same size to be compared and below 32
        // NOTE(review): the "below 32" limit is not enforced by this
        // definition — presumably checked elsewhere; confirm.
        #[arg(long = "kmer-size", short = 'k', default_value_t = 21)]
        kmer_size: u8,

        /// Scale the hash space to a minimum fraction of the maximum hash value (FracMinHash)
        #[arg(long)]
        fscale: Option<u64>,

        /// Complexity cut-off, only hash sequences with complexity above this value
        /// This is created via shannon entropy
        // Kept as a string default so the rendered help text is unchanged.
        #[arg(long, default_value = "0.0")]
        complexity: f64,

        /// Create a separate sketch for each sequence record
        /// Will increase the size of the output file
        #[arg(long)]
        singleton: bool,

        /// Custom temporary directory for intermediate files during sorting
        #[arg(long)]
        temp_dir: Option<PathBuf>,

        /// Path to a bias table file (.bias) for hash-based filtering
        #[arg(long)]
        bias_table: Option<PathBuf>,
    },

    /// Estimate containment of a query sequence against a sketch database.
    /// Requires all sketches to have the same kmer size
    #[command(arg_required_else_help = true)]
    Dist {
        /// Input FASTA/FASTQ file to query
        #[arg(long, short)]
        input: PathBuf,

        /// Database sketch (.jam file)
        #[arg(long, short)]
        database: PathBuf,

        /// Output to file instead of stdout
        #[arg(long, short)]
        output: Option<PathBuf>,

        /// Cut-off value for similarity/containment
        // Kept as a string default so the rendered help text is unchanged.
        #[arg(long, short, default_value = "0.0")]
        cutoff: f64,

        /// Singleton mode, process each query sequence separately
        // A `bool` field is a set-true flag that is `false` when absent.
        #[arg(long)]
        singleton: bool,
    },

    /// Build and analyze hash bias tables for filtering
    #[command(arg_required_else_help = true)]
    Bias {
        // Nested subcommand selecting the bias-table operation.
        #[command(subcommand)]
        command: BiasCommands,
    },

    /// Display statistics about a JAM database
    #[command(arg_required_else_help = true)]
    Stats {
        /// Input JAM database (.jam file)
        #[arg(long, short)]
        input: PathBuf,

        /// Short summary only
        #[arg(long)]
        short: bool,

        /// Include the full entry statistics
        #[arg(long)]
        full: bool,
    },
}
// Subcommands for creating and inspecting bias tables.
//
// Maintainer notes in this block are plain `//` comments on purpose: clap's
// derive turns `///` doc comments into user-visible help text, so the `///`
// lines below are the help strings and are kept verbatim.
#[derive(Clone, Debug, Subcommand)]
pub enum BiasCommands {
    /// Create a bias table from positive (target) and negative (background) FASTA files.
    /// Target signal is always subtracted from background before computing bias weights.
    #[command(arg_required_else_help = true)]
    Create {
        /// Positive (target) FASTA file(s) - sequences to enrich for
        #[arg(long, num_args = 1.., required = true)]
        positive: Vec<PathBuf>,

        /// Negative (background) FASTA file(s) - sequences to deplete.
        /// Target signal is subtracted from background automatically.
        #[arg(long, num_args = 1.., required = true)]
        negative: Vec<PathBuf>,

        /// Output bias table file (.bias)
        #[arg(long, short)]
        output: PathBuf,

        /// K-mer size (must match sketch k-mer size)
        #[arg(long = "kmer-size", short = 'k', default_value_t = 21)]
        kmer_size: u8,

        /// FracMinHash scale (must match sketch fscale)
        #[arg(long, default_value_t = 1000)]
        fscale: u64,

        /// Count-Min Sketch width (columns, power of 2 recommended)
        // 1_048_576 == 2^20.
        #[arg(long, default_value_t = 1_048_576)]
        cms_width: usize,

        /// Count-Min Sketch depth (number of hash functions)
        #[arg(long, default_value_t = 5)]
        cms_depth: usize,

        /// Smoothing parameter for log-ratio computation
        // Kept as a string default so the rendered help text is unchanged.
        #[arg(long, default_value = "1.0")]
        alpha: f32,

        /// Target fold enrichment. If not set, maximizes automatically.
        /// A warning is shown if the requested value exceeds the maximum
        /// achievable by the data.
        #[arg(long)]
        fold_enrichment: Option<f32>,

        /// Number of threads to use for bias sketching
        // NOTE(review): shares the `--threads` long name with the global
        // option declared on `Cli` — confirm the interaction is intended.
        #[arg(long)]
        threads: Option<usize>,
    },

    /// Display statistics for a bias table (.bias file)
    #[command(arg_required_else_help = true)]
    Stats {
        /// Input bias table file (.bias)
        // Positional argument (no `short`/`long` requested).
        input: PathBuf,

        /// Output JSON report to file instead of stderr
        #[arg(long, short)]
        output: Option<PathBuf>,
    },
}