mod cli;
mod processor;
mod utils;
mod hasher;
use crate::hasher::HashType;
use crate::utils::{
birthday_problem_square_approximation, estimate_sequence_capacity, get_hash_method,
};
use crate::processor::{execute_deduplication, execute_paired_deduplication, DedupConfig};
use anyhow::{bail, Context, Result};
use clap::Parser;
use cli::Cli;
use std::path::Path;
use std::time::Instant;
/// Runs single-end deduplication with the hash width selected by `hash_type`.
///
/// `HashType::XXH3_64` instantiates `execute_deduplication` with `u64`,
/// `HashType::XXH3_128` with `u128`. When `cfg.verbose` is set, a one-line
/// banner naming the chosen width is printed to stdout before the run.
///
/// # Errors
///
/// Propagates whatever error `execute_deduplication` returns.
///
/// # Returns
///
/// The pair produced by `execute_deduplication`, passed through unchanged
/// (`main` binds it as `(processed, duplicates)`).
fn dispatch(
    input_path: &str,
    output_path: &str,
    hash_type: &HashType,
    cfg: &DedupConfig,
) -> Result<(usize, usize)> {
    // Banner printing is identical in every arm apart from the text.
    let announce = |banner: &str| {
        if cfg.verbose {
            println!("{}", banner);
        }
    };

    match hash_type {
        HashType::XXH3_64 => {
            announce("Single-End Mode: 64-bit Hash");
            execute_deduplication::<u64>(input_path, output_path, cfg)
        }
        HashType::XXH3_128 => {
            announce("Single-End Mode: 128-bit Hash");
            execute_deduplication::<u128>(input_path, output_path, cfg)
        }
    }
}
/// Runs paired-end deduplication with the hash width selected by `hash_type`.
///
/// `HashType::XXH3_64` instantiates `execute_paired_deduplication` with
/// `u64`, `HashType::XXH3_128` with `u128`. When `cfg.verbose` is set, a
/// one-line banner naming the chosen width is printed to stdout first.
///
/// # Errors
///
/// Propagates whatever error `execute_paired_deduplication` returns.
///
/// # Returns
///
/// The pair produced by `execute_paired_deduplication`, passed through
/// unchanged (`main` binds it as `(processed, duplicates)`).
fn dispatch_paired(
    input_r1: &str,
    input_r2: &str,
    output_r1: &str,
    output_r2: &str,
    hash_type: &HashType,
    cfg: &DedupConfig,
) -> Result<(usize, usize)> {
    // Pick the banner text up front so the verbosity check appears once.
    let banner = match hash_type {
        HashType::XXH3_64 => "Paired-End Mode: 64-bit Combined Hash",
        HashType::XXH3_128 => "Paired-End Mode: 128-bit Combined Hash",
    };
    if cfg.verbose {
        println!("{}", banner);
    }

    // The hash width is a type parameter, so each variant needs its own call.
    match hash_type {
        HashType::XXH3_64 => {
            execute_paired_deduplication::<u64>(input_r1, input_r2, output_r1, output_r2, cfg)
        }
        HashType::XXH3_128 => {
            execute_paired_deduplication::<u128>(input_r1, input_r2, output_r1, output_r2, cfg)
        }
    }
}
/// Program entry point.
///
/// Parses the command line, validates the arguments, estimates the hash
/// table capacity, selects a hash width (explicit `--hash` or automatic via
/// `get_hash_method`), runs single-end or paired-end deduplication, and in
/// verbose mode prints a summary.
///
/// # Errors
///
/// Returns an error when:
/// * `--input-r2` is given without `--output-r2`;
/// * the R1 or R2 input path does not exist;
/// * the capacity of the primary input cannot be estimated;
/// * the deduplication run itself fails.
fn main() -> Result<()> {
    let args = Cli::parse();

    // Resolve the paired-end arguments once, before touching the file system
    // or printing anything, so an inconsistent invocation fails immediately
    // instead of after the capacity estimation has already run.
    let paired = match args.input_r2.as_ref() {
        Some(input_r2) => {
            let output_r2 = args.output_r2.as_ref().context(
                "The --output-r2 (-p) argument is required when --input-r2 (-2) is used.",
            )?;
            Some((input_r2, output_r2))
        }
        None => None,
    };

    if !Path::new(&args.input).exists() {
        bail!("Input file not found: {}", args.input);
    }
    if let Some((input_r2, _)) = paired {
        if !Path::new(input_r2).exists() {
            bail!("R2 input file not found: {}", input_r2);
        }
    }

    // NOTE(review): 0.001 is presumably the CLI default for --threshold, so a
    // different value means the user set it explicitly — confirm against `cli`.
    if args.hash.is_some() && args.threshold != 0.001 {
        eprintln!(
            "Warning: --hash specifies a hash size, so the automatic selection threshold ({}) is ignored.",
            args.threshold
        );
    }

    if args.verbose {
        println!("Primary input file (R1): {}", args.input);
        println!("Primary output file (R1): {}", args.output);
        println!("Expected read length: {} bp", args.read_length);
    }

    let cap_input = estimate_sequence_capacity(&args.input, args.read_length)
        .context("Failed to estimate input file capacity")?;
    // With --force the existing output is not counted. Otherwise its
    // estimated capacity is added, and any estimation error is deliberately
    // treated as zero (best effort).
    let cap_output = if args.force {
        0
    } else {
        estimate_sequence_capacity(&args.output, args.read_length).unwrap_or(0)
    };
    let total_capacity = cap_input + cap_output;

    // An explicit --hash wins; otherwise the width is derived from the
    // estimated capacity and the threshold.
    let selected_hash_type = match args.hash {
        Some(mode) => HashType::from(mode),
        None => get_hash_method(total_capacity, args.threshold),
    };

    if args.verbose {
        println!(
            "Total estimated hash table capacity: {} fragments",
            total_capacity
        );
    }

    let cfg = DedupConfig {
        force: args.force,
        verbose: args.verbose,
        dry_run: args.dry_run,
        compression_level: args.compression,
        estimated_capacity: total_capacity,
        read_length: args.read_length,
    };

    let start = Instant::now();
    let (processed, duplicates) = match paired {
        Some((input_r2, output_r2)) => {
            if args.verbose {
                println!("Secondary input file (R2): {}", input_r2);
                println!("Secondary output file (R2): {}", output_r2);
                println!("--- Starting Paired-End Processing ---");
            }
            dispatch_paired(
                &args.input,
                input_r2,
                &args.output,
                output_r2,
                &selected_hash_type,
                &cfg,
            )?
        }
        None => {
            if args.verbose {
                println!("--- Starting Single-End Processing ---");
            }
            dispatch(&args.input, &args.output, &selected_hash_type, &cfg)?
        }
    };

    if args.verbose {
        // Guard against division by zero when nothing was processed.
        let duplicate_pct = if processed > 0 {
            duplicates as f64 / processed as f64 * 100.0
        } else {
            0.0
        };
        println!(
            "Processed fragments: {}\nDuplicates removed: {:.2}%",
            processed, duplicate_pct
        );
        println!(
            "Estimated collisions: {:.2e}%",
            birthday_problem_square_approximation(processed, &selected_hash_type) * 100.0
        );
        println!("Total execution time: {:.2?}", start.elapsed());
    }

    Ok(())
}