use crate::utils::expand_spaced_seed_mask;
use crate::{construct_seed_template, parse_binary};
use clap::Parser;
use seqkmer::Meros;
use seqkmer::{
BITS_PER_CHAR, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES,
DEFAULT_TOGGLE_MASK,
};
use std::path::PathBuf;
/// The largest `u32` value (4_294_967_295), widened to `u64`.
///
/// Note: despite the `PLUS` in the name, no `+ 1` is applied; the value is exactly `u32::MAX`.
pub const U32MAXPLUS: u64 = 0xFFFF_FFFF;
/// 2^30 (1_073_741_824), i.e. one gibibyte when interpreted as a byte count.
pub const ONEGB: u64 = 1 << 30;
// Command-line arguments for the database build command (clap `about`: "build database").
// NOTE: plain `//` comments are used on purpose here; clap's derive turns `///` doc
// comments into --help text, which would change the program's output.
#[derive(Parser, Debug, Clone)]
#[clap(version, about = "build database")]
pub struct Build {
// Required `--db` option holding the database path.
#[arg(long = "db", required = true)]
pub database: PathBuf,
// The options declared on `KLMTArgs` are flattened into this command's own option list.
#[clap(flatten)]
pub klmt: KLMTArgs,
// `-r` / `--requested-bits-for-taxid`: accepts the half-open range 0..31 (0 through 30),
// default 0.
// NOTE(review): presumably 0 means "choose automatically" — confirm against the consumer.
#[clap(short, long, value_parser = clap::value_parser!(u8).range(0..31), default_value_t = 0)]
pub requested_bits_for_taxid: u8,
// `-p` / `--threads`: defaults to whatever `num_cpus::get()` returns at startup.
#[clap(short = 'p', long, default_value_t = num_cpus::get())]
pub threads: usize,
}
/// Default for `ClassifyArgs::buffer_size`: 16 * 2^20 = 16_777_216.
const BUFFER_SIZE: usize = 16 << 20;
// Command-line arguments for the unified classification command.
// NOTE: plain `//` comments are used on purpose here; clap's derive turns `///` doc
// comments into --help text, which would change the program's output.
#[derive(Parser, Debug, Clone)]
#[clap(
version,
about = "Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences",
long_about = "classify a set of sequences"
)]
pub struct ClassifyArgs {
// Required `--db` option holding the database path.
#[arg(long = "db", required = true)]
pub database: PathBuf,
// `--chunk-dir`: no default is declared and the type is not `Option`, so a value must be given.
// NOTE(review): presumably a working directory for intermediate chunk files — confirm.
#[clap(long)]
pub chunk_dir: PathBuf,
// Optional `--output-dir`; `None` when the option is absent.
#[clap(long = "output-dir", value_parser)]
pub output_dir: Option<PathBuf>,
// `-P` / `--paired-end-processing` boolean flag.
#[clap(short = 'P', long = "paired-end-processing", action)]
pub paired_end_processing: bool,
// `-S` / `--single-file-pairs` boolean flag.
#[clap(short = 'S', long = "single-file-pairs", action)]
pub single_file_pairs: bool,
// `-Q` / `--minimum-quality-score`, default 0.
#[clap(
short = 'Q',
long = "minimum-quality-score",
value_parser,
default_value_t = 0
)]
pub minimum_quality_score: i32,
// `-p` / `--num-threads`: defaults to whatever `num_cpus::get()` returns at startup.
#[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())]
pub num_threads: usize,
// `--buffer-size`: defaults to the `BUFFER_SIZE` constant.
// NOTE(review): unit is presumably bytes — confirm against the reader that consumes it.
#[clap(long, default_value_t = BUFFER_SIZE)]
pub buffer_size: usize,
// `--batch-size`: restricted to 1..=32 by the value parser, default 4.
#[clap(long, value_parser = clap::value_parser!(u32).range(1..=32), default_value_t = 4)]
pub batch_size: u32,
// `-T` / `--confidence-threshold`, default 0.0. No range check is applied at parse time.
#[clap(
short = 'T',
long = "confidence-threshold",
value_parser,
default_value_t = 0.0
)]
pub confidence_threshold: f64,
// `-g` / `--minimum-hit-groups`, default 2.
#[clap(
short = 'g',
long = "minimum-hit-groups",
value_parser,
default_value_t = 2
)]
pub minimum_hit_groups: usize,
// `-K` / `--report-kmer-data` boolean flag, default false.
#[clap(short = 'K', long, value_parser, default_value_t = false)]
pub report_kmer_data: bool,
// `-z` / `--report-zero-counts` boolean flag, default false.
#[clap(short = 'z', long, value_parser, default_value_t = false)]
pub report_zero_counts: bool,
// No `short`/`long` attribute, so these are positional arguments: the input file paths.
pub input_files: Vec<PathBuf>,
}
// k-mer / minimizer parameters shared between commands; `as_meros` turns them into a `Meros`.
// NOTE: plain `//` comments are used on purpose here; clap's derive turns `///` doc
// comments into --help text, which would change the program's output.
#[derive(Parser, Debug, Clone, Copy)]
#[clap(version, about = "k-mer")]
pub struct KLMTArgs {
// `-k` / `--k-mer`: any integer >= 1, default `DEFAULT_KMER_LENGTH`.
#[clap(short, long, value_parser = clap::value_parser!(u64).range(1..), default_value_t = DEFAULT_KMER_LENGTH)]
pub k_mer: u64,
// `-l` / `--l-mer`: restricted to 1..=31, default `DEFAULT_MINIMIZER_LENGTH`.
#[clap(short, long, value_parser = clap::value_parser!(u8).range(1..=31), default_value_t = DEFAULT_MINIMIZER_LENGTH)]
pub l_mer: u8,
// `--minimizer-spaces`: default `DEFAULT_MINIMIZER_SPACES`. No range check is applied here,
// so it is not validated against `l_mer` at parse time.
#[clap(long, default_value_t = DEFAULT_MINIMIZER_SPACES)]
pub minimizer_spaces: u8,
// `-T` / `--toggle-mask`: default `DEFAULT_TOGGLE_MASK`.
#[clap(short = 'T', long, default_value_t = DEFAULT_TOGGLE_MASK)]
pub toggle_mask: u64,
// Optional `--min-clear-hash-value`; `None` when the option is absent.
#[clap(long)]
pub min_clear_hash_value: Option<u64>,
}
impl KLMTArgs {
    /// Builds a [`Meros`] from these command-line parameters.
    ///
    /// A seed template is generated from `l_mer` and `minimizer_spaces`, parsed with
    /// `parse_binary`, and passed through `expand_spaced_seed_mask` with `BITS_PER_CHAR`.
    /// The resulting mask is handed to `Meros::new` together with the k-mer length,
    /// minimizer length, toggle mask and optional minimum clear hash value.
    ///
    /// # Panics
    ///
    /// Panics if `parse_binary` fails on the generated seed template.
    pub fn as_meros(&self) -> Meros {
        let minimizer_len = self.l_mer as usize;
        let template = construct_seed_template(minimizer_len, self.minimizer_spaces as usize);
        let expanded_mask =
            expand_spaced_seed_mask(parse_binary(&template).unwrap(), BITS_PER_CHAR as u64);

        Meros::new(
            self.k_mer as usize,
            minimizer_len,
            Some(expanded_mask),
            Some(self.toggle_mask),
            self.min_clear_hash_value,
        )
    }
}
/// Parses a human-readable size such as `"4G"`, `"512m"` or `"1.5K"` into a plain number
/// (presumably a byte count).
///
/// The last character is the unit suffix: `G`, `M` or `K` in either case, meaning
/// 1024^3, 1024^2 and 1024 respectively. Everything before it is parsed as an `f64`.
/// The product is truncated toward zero; finite products too large for `usize` saturate
/// at `usize::MAX`.
///
/// # Errors
///
/// Returns an error message when:
/// - the input is shorter than two bytes,
/// - the numeric part is not a valid floating-point number,
/// - the suffix is not one of `G`, `M`, `K` (case-insensitive),
/// - the number is negative, NaN or infinite.
pub fn parse_size(s: &str) -> Result<usize, String> {
    if s.len() < 2 {
        return Err("Size must be at least two characters".to_string());
    }

    // Split on the last *character*, not the last byte: slicing at `len - 1` panics when
    // the input ends in a multi-byte UTF-8 character.
    let suffix = s
        .chars()
        .next_back()
        .ok_or_else(|| "Size must be at least two characters".to_string())?;
    let digits = &s[..s.len() - suffix.len_utf8()];

    let number: f64 = digits.parse().map_err(|_| "Invalid number".to_string())?;
    let unit = match suffix {
        'G' | 'g' => 1_073_741_824.0,
        'M' | 'm' => 1_048_576.0,
        'K' | 'k' => 1_024.0,
        _ => return Err("Invalid size suffix. Use 'G', 'M', or 'K'".to_string()),
    };

    // `f64::from_str` accepts "nan", "inf" and negative values; casting those to `usize`
    // would silently yield 0 or `usize::MAX`, so reject them explicitly.
    if !number.is_finite() || number < 0.0 {
        return Err("Size must be a finite, non-negative number".to_string());
    }

    Ok((number * unit) as usize)
}