use anyhow::{anyhow, bail};
use bio_types::strand::Strand;
use clap::{arg, builder::ArgGroup, crate_authors, crate_version, value_parser, Command};
use csv::Error as CSVError;
use csv::ErrorKind;
use itertools::Itertools;
use mimalloc::MiMalloc;
use rand::Rng;
use slog::{crit, o, warn, Drain};
use std::path::PathBuf;
use alevin_fry::cellfilter::{generate_permit_list, CellFilterMethod};
use alevin_fry::cmd_parse_utils::{
pathbuf_directory_exists_validator, pathbuf_file_exists_validator,
};
use alevin_fry::prog_opts::{GenPermitListOpts, QuantOpts};
use alevin_fry::quant::{ResolutionStrategy, SplicedAmbiguityModel};
#[global_allocator]
static GLOBAL: MiMalloc = MiMalloc;
const VERSION: &str = env!("CARGO_PKG_VERSION");
#[allow(dead_code)]
fn gen_random_kmer(k: usize) -> String {
const CHARSET: &[u8] = b"ACGT";
let mut rng = rand::thread_rng();
let s: String = (0..k)
.map(|_| {
let idx = rng.gen_range(0..CHARSET.len());
CHARSET[idx] as char
})
.collect();
s
}
#[allow(clippy::manual_clamp)]
fn main() -> anyhow::Result<()> {
let num_hardware_threads = num_cpus::get() as u32;
let max_num_threads: String = (num_cpus::get() as u32).to_string();
let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string();
let crate_authors = crate_authors!("\n");
let version = crate_version!();
let cmdline = std::env::args().join(" ");
let convert_app = Command::new("convert")
.about("Convert a BAM file to a RAD file")
.version(version)
.author(crate_authors)
.arg(
arg!(-b --bam <BAMFILE> "input SAM/BAM file")
.required(true)
.value_parser(pathbuf_file_exists_validator),
)
.arg(
arg!(-t --threads <THREADS> "number of threads to use for processing")
.required(false)
.value_parser(value_parser!(u32))
.default_value(max_num_threads.clone()),
)
.arg(
arg!(-o --output <RADFILE> "output RAD file")
.required(true)
.value_parser(value_parser!(PathBuf)),
);
let view_app = Command::new("view")
.about("View a RAD file")
.version(version)
.author(crate_authors)
.arg(
arg!(-r --rad <RADFILE> "input RAD file")
.required(true)
.value_parser(pathbuf_file_exists_validator),
)
.arg(arg!(-H --header "flag for printing header"));
let gen_app = Command::new("generate-permit-list")
.about("Generate a permit list of barcodes from a RAD file")
.version(version)
.author(crate_authors)
.arg(arg!(-i --input <INPUT> "input directory containing the map.rad RAD file")
.required(true)
.value_parser(pathbuf_directory_exists_validator))
.arg(arg!(-d --"expected-ori" <EXPECTEDORI> "the expected orientation of alignments")
.required(true)
.ignore_case(true)
.value_parser(["fw", "rc", "both", "either"]))
.arg(arg!(-o --"output-dir" <OUTPUTDIR> "output directory").required(true).value_parser(value_parser!(PathBuf)))
.arg(arg!(
-k --"knee-distance" "attempt to determine the number of barcodes to keep using the knee distance method."
)
)
.arg(arg!(-e --"expect-cells" <EXPECTCELLS> "defines the expected number of cells to use in determining the (read, not UMI) based cutoff")
.value_parser(value_parser!(usize)))
.arg(arg!(-f --"force-cells" <FORCECELLS> "select the top-k most-frequent barcodes, based on read count, as valid (true)")
.value_parser(value_parser!(usize)))
.arg(
arg!(-b --"valid-bc" <VALIDBC> "uses true barcode collected from a provided file")
.value_parser(pathbuf_file_exists_validator)
)
.arg(
arg!(-u --"unfiltered-pl" <UNFILTEREDPL> "uses an unfiltered external permit list")
.value_parser(pathbuf_file_exists_validator)
)
.group(ArgGroup::new("filter-method")
.args(["knee-distance", "expect-cells", "force-cells", "valid-bc", "unfiltered-pl"])
.required(true)
)
.arg(
arg!(-m --"min-reads" <MINREADS> "minimum read count threshold; only used with --unfiltered-pl")
.value_parser(value_parser!(usize))
.default_value("10"));
let collate_app = Command::new("collate")
.about("Collate a RAD file by corrected cell barcode")
.version(version)
.author(crate_authors)
.arg(arg!(-i --"input-dir" <INPUTDIR> "input directory made by generate-permit-list")
.required(true)
.value_parser(pathbuf_directory_exists_validator))
.arg(arg!(-r --"rad-dir" <RADFILE> "the directory containing the RAD file to be collated")
.required(true)
.value_parser(pathbuf_directory_exists_validator))
.arg(arg!(-t --threads <THREADS> "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_collate_threads))
.arg(arg!(-c --compress "compress the output collated RAD file"))
.arg(arg!(-m --"max-records" <MAXRECORDS> "the maximum number of read records to keep in memory at once")
.value_parser(value_parser!(u32))
.default_value("30000000"));
let quant_app = Command::new("quant")
.about("Quantify expression from a collated RAD file")
.version(version)
.author(crate_authors)
.arg(arg!(-i --"input-dir" <INPUTDIR> "input directory containing collated RAD file")
.required(true)
.value_parser(pathbuf_directory_exists_validator))
.arg(arg!(-m --"tg-map" <TGMAP> "transcript to gene map").required(true).value_parser(pathbuf_file_exists_validator))
.arg(arg!(-o --"output-dir" <OUTPUTDIR> "output directory where quantification results will be written").required(true).value_parser(value_parser!(PathBuf)))
.arg(arg!(-t --threads <THREADS> "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_threads.clone()))
.arg(arg!(-d --"dump-eqclasses" "flag for dumping equivalence classes"))
.arg(arg!(-b --"num-bootstraps" <NUMBOOTSTRAPS> "number of bootstraps to use").value_parser(value_parser!(u32)).default_value("0"))
.arg(arg!(--"init-uniform" "flag for uniform sampling").requires("num-bootstraps"))
.arg(arg!(--"summary-stat" "flag for storing only summary statistics").requires("num-bootstraps"))
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market format (default)"))
.arg(arg!(--"use-eds" "flag for writing output matrix in EDS format").conflicts_with("use-mtx"))
.arg(arg!(--"quant-subset" <SFILE> "file containing list of barcodes to quantify, those not in this list will be ignored").value_parser(pathbuf_file_exists_validator))
.arg(arg!(-r --resolution <RESOLUTION> "the resolution strategy by which molecules will be counted")
.required(true)
.ignore_case(true)
.value_parser(value_parser!(ResolutionStrategy)))
.arg(arg!(--"sa-model" <SAMODEL> "preferred model of splicing ambiguity")
.ignore_case(true)
.value_parser(value_parser!(SplicedAmbiguityModel))
.default_value("winner-take-all")
.hide(true))
.arg(arg!(--"umi-edit-dist" <EDIST> "the Hamming distance within which potentially colliding UMIs will be considered for correction")
.value_parser(value_parser!(u32))
.default_value_ifs([
("resolution", "cr-like", Some("0")),
("resolution", "cr-like-em", Some("0")),
("resolution", "trivial", Some("0")),
("resolution", "parsimony", Some("1")),
("resolution", "parsimony-em", Some("1")),
("resolution", "parsimony-gene", Some("1")),
("resolution", "parsimony-gene-em", Some("1")),
])
.hide(true))
.arg(arg!(--"large-graph-thresh" <NVERT> "the order (number of nodes) of a PUG above which the alternative resolution strategy will be applied")
.value_parser(value_parser!(usize))
.default_value_ifs([
("resolution", "parsimony-gene-em", Some("1000")),
("resolution", "parsimony", Some("1000")),
("resolution", "parsimony-em", Some("1000")),
("resolution", "parsimony-gene", Some("1000")),
])
.default_value("0") .hide(true))
.arg(arg!(--"small-thresh" <SMALLTHRESH> "cells with fewer than these many reads will be resolved using a custom approach")
.value_parser(value_parser!(usize))
.default_value("10")
.hide(true));
let infer_app = Command::new("infer")
.about("Perform inference on equivalence class count data")
.version(version)
.author(crate_authors)
.arg(arg!(-c --"count-mat" <EQCMAT> "matrix of cells by equivalence class counts")
.required(true)
.value_parser(pathbuf_file_exists_validator))
.arg(arg!(-e --"eq-labels" <EQLABELS> "file containing the gene labels of the equivalence classes")
.required(true)
.value_parser(pathbuf_file_exists_validator))
.arg(arg!(-o --"output-dir" <OUTPUTDIR> "output directory where quantification results will be written").required(true).value_parser(value_parser!(PathBuf)))
.arg(arg!(-t --threads <THREADS> "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_threads))
.arg(arg!(--usa "flag specifying that input equivalence classes were computed in USA mode"))
.arg(arg!(--"quant-subset" <SFILE> "file containing list of barcodes to quantify, those not in this list will be ignored").value_parser(pathbuf_file_exists_validator))
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market format (default)"))
.arg(arg!(--"use-eds" "flag for writing output matrix in EDS format").conflicts_with("use-mtx"));
let opts = Command::new("alevin-fry")
.subcommand_required(true)
.arg_required_else_help(true)
.version(version)
.author(crate_authors)
.about("Process RAD files from the command line")
.subcommand(gen_app)
.subcommand(collate_app)
.subcommand(quant_app)
.subcommand(infer_app)
.subcommand(convert_app)
.subcommand(view_app)
.get_matches();
let decorator = slog_term::TermDecorator::new().build();
let drain = slog_term::CompactFormat::new(decorator)
.use_custom_timestamp(|out: &mut dyn std::io::Write| {
write!(out, "{}", chrono::Local::now().format("%Y-%m-%d %H:%M:%S")).unwrap();
Ok(())
})
.build()
.fuse();
let drain = slog_async::Async::new(drain).build().fuse();
let log = slog::Logger::root(drain, o!());
if let Some(t) = opts.subcommand_matches("generate-permit-list") {
let input_dir: &PathBuf = t.get_one("input").expect("no input directory specified");
let output_dir: &PathBuf = t
.get_one("output-dir")
.expect("no output directory specified");
let valid_ori: bool;
let expected_ori = match t
.get_one::<String>("expected-ori")
.unwrap()
.to_uppercase()
.as_str()
{
"RC" => {
valid_ori = true;
Strand::Reverse
}
"FW" => {
valid_ori = true;
Strand::Forward
}
"BOTH" => {
valid_ori = true;
Strand::Unknown
}
"EITHER" => {
valid_ori = true;
Strand::Unknown
}
_ => {
valid_ori = false;
Strand::Unknown
}
};
if !valid_ori {
crit!(
log,
"{} is not a valid option for --expected-ori",
expected_ori
);
std::process::exit(1);
}
let mut fmeth = CellFilterMethod::KneeFinding;
let _expect_cells: Option<usize> = match t.get_one::<usize>("expect-cells") {
Some(v) => {
fmeth = CellFilterMethod::ExpectCells(*v);
Some(*v)
}
None => None,
};
if t.get_flag("knee-distance") {
fmeth = CellFilterMethod::KneeFinding;
}
let _force_cells = match t.get_one::<usize>("force-cells") {
Some(v) => {
fmeth = CellFilterMethod::ForceCells(*v);
Some(*v)
}
None => None,
};
let _valid_bc = match t.get_one::<PathBuf>("valid-bc") {
Some(v) => {
fmeth = CellFilterMethod::ExplicitList(v.clone());
Some(v)
}
None => None,
};
if let Some(v) = t.get_one::<PathBuf>("unfiltered-pl") {
let min_reads: usize = *t
.get_one("min-reads")
.expect("min-reads must be a valid integer");
if min_reads < 1 {
crit!(
log,
"min-reads < 1 is not supported, the value {} was provided",
min_reads
);
std::process::exit(1);
}
fmeth = CellFilterMethod::UnfilteredExternalList(v.clone(), min_reads);
};
let velo_mode = false;
let gpl_opts = GenPermitListOpts::builder()
.input_dir(input_dir)
.output_dir(output_dir)
.fmeth(fmeth)
.expected_ori(expected_ori)
.version(VERSION)
.velo_mode(velo_mode)
.cmdline(&cmdline)
.log(&log)
.build();
match generate_permit_list(gpl_opts) {
Ok(nc) if nc == 0 => {
warn!(log, "found 0 corrected barcodes; please check the input.");
}
Err(e) => return Err(e),
_ => (),
};
}
if let Some(t) = opts.subcommand_matches("convert") {
let input_file: &PathBuf = t.get_one("bam").unwrap();
let rad_file: &PathBuf = t.get_one("output").unwrap();
let num_threads: u32 = *t.get_one("threads").unwrap();
alevin_fry::convert::bam2rad(input_file, rad_file, num_threads, &log)
}
if let Some(t) = opts.subcommand_matches("view") {
let rad_file: &PathBuf = t.get_one("rad").unwrap();
let print_header = t.get_flag("header");
alevin_fry::convert::view(rad_file, print_header, &log)
}
if let Some(t) = opts.subcommand_matches("collate") {
let input_dir: &PathBuf = t.get_one("input-dir").unwrap();
let rad_dir: &PathBuf = t.get_one("rad-dir").unwrap();
let num_threads = *t.get_one("threads").unwrap();
let compress_out = t.get_flag("compress");
let max_records: u32 = *t.get_one("max-records").unwrap();
alevin_fry::collate::collate(
input_dir,
rad_dir,
num_threads,
max_records,
compress_out,
&cmdline,
VERSION,
&log,
)
.expect("could not collate.");
}
if let Some(t) = opts.subcommand_matches("quant") {
let num_threads = *t.get_one("threads").unwrap();
let num_bootstraps = *t.get_one("num-bootstraps").unwrap();
let init_uniform = t.get_flag("init-uniform");
let summary_stat = t.get_flag("summary-stat");
let dump_eq = t.get_flag("dump-eqclasses");
let use_mtx = !t.get_flag("use-eds");
let input_dir: &PathBuf = t.get_one("input-dir").unwrap();
let output_dir: &PathBuf = t.get_one("output-dir").unwrap();
let tg_map: &PathBuf = t.get_one("tg-map").unwrap();
let resolution = *t.get_one::<ResolutionStrategy>("resolution").unwrap();
let sa_model = *t.get_one::<SplicedAmbiguityModel>("sa-model").unwrap();
let small_thresh = *t.get_one("small-thresh").unwrap();
let filter_list: Option<&PathBuf> = t.get_one("quant-subset");
let large_graph_thresh: usize = *t.get_one("large-graph-thresh").unwrap();
let umi_edit_dist: u32 = *t.get_one("umi-edit-dist").unwrap();
let mut pug_exact_umi = false;
match umi_edit_dist {
0 => {
match resolution {
ResolutionStrategy::Trivial
| ResolutionStrategy::CellRangerLike
| ResolutionStrategy::CellRangerLikeEm => {
}
ResolutionStrategy::Parsimony
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGene
| ResolutionStrategy::ParsimonyGeneEm => {
pug_exact_umi = true;
}
}
}
1 => {
match resolution {
ResolutionStrategy::Trivial
| ResolutionStrategy::CellRangerLike
| ResolutionStrategy::CellRangerLikeEm => {
crit!(
log,
"\n\nResolution strategy {:?} doesn't currently support 1-edit UMI resolution",
resolution
);
bail!("Invalid command line option");
}
ResolutionStrategy::Parsimony
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGene
| ResolutionStrategy::ParsimonyGeneEm => {
pug_exact_umi = false;
}
}
}
j => {
crit!(
log,
"\n\nResolution strategy {:?} doesn't currently support {}-edit UMI resolution",
resolution,
j
);
bail!("Invalid command line option");
}
}
if dump_eq && (resolution == ResolutionStrategy::Trivial) {
crit!(
log,
"\n\nGene equivalence classes are not meaningful in case of Trivial resolution."
);
std::process::exit(1);
}
if num_bootstraps > 0 {
match resolution {
ResolutionStrategy::CellRangerLikeEm
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGeneEm => {
}
_ => {
eprintln!(
"\n\nThe num_bootstraps argument was set to {}, but bootstrapping can only be used with the cr-like-em, parsimony-em, or parsimony-gene-em resolution strategies",
num_bootstraps
);
std::process::exit(1);
}
}
}
let parent = std::path::Path::new(&input_dir);
let json_path = parent.join("generate_permit_list.json");
let quant_opts = QuantOpts::builder()
.input_dir(input_dir)
.tg_map(tg_map)
.output_dir(output_dir)
.num_threads(num_threads)
.num_bootstraps(num_bootstraps)
.init_uniform(init_uniform)
.summary_stat(summary_stat)
.dump_eq(dump_eq)
.use_mtx(use_mtx)
.resolution(resolution)
.sa_model(sa_model)
.small_thresh(small_thresh)
.large_graph_thresh(large_graph_thresh)
.filter_list(filter_list)
.pug_exact_umi(pug_exact_umi)
.cmdline(&cmdline)
.version(VERSION)
.log(&log)
.build();
if json_path.exists() {
let velo_mode = alevin_fry::utils::is_velo_mode(quant_opts.input_dir);
if velo_mode {
match alevin_fry::quant::velo_quantify(quant_opts) {
Ok(_) => {}
Err(e) => match e.downcast_ref::<CSVError>() {
Some(error) => {
match *error.kind() {
ErrorKind::Deserialize { .. } => {
return Err(anyhow!("execution terminated unexpectedly"));
}
_ => {
panic!("could not quantify rad file.");
}
}
}
None => {
panic!("could not quantify rad file.");
}
},
}; } else {
match alevin_fry::quant::quantify(quant_opts) {
Ok(_) => {}
Err(e) => match e.downcast_ref::<CSVError>() {
Some(error) => {
match *error.kind() {
ErrorKind::Deserialize { .. } => {
return Err(anyhow!("execution terminated unexpectedly"));
}
_ => {
panic!("could not quantify rad file.");
}
}
}
None => {
panic!("could not quantify rad file.");
}
},
}; }; } else {
crit!(log,
"The provided input directory lacks a generate_permit_list.json file; this should not happen."
);
bail!("The provided input directory lacks a generate_permit_list.json file; this should not happen.");
}
}
if let Some(t) = opts.subcommand_matches("infer") {
let num_threads = *t.get_one("threads").unwrap();
let use_mtx = !t.get_flag("use-eds");
let output_dir = t.get_one("output-dir").unwrap();
let count_mat = t.get_one("count-mat").unwrap();
let eq_label_file = t.get_one("eq-labels").unwrap();
let filter_list: Option<&PathBuf> = t.get_one("quant-subset");
let usa_mode = t.get_flag("usa");
alevin_fry::infer::infer(
count_mat,
eq_label_file,
usa_mode,
use_mtx,
num_threads,
filter_list,
output_dir,
&log,
)
.expect("could not perform inference from equivalence class counts.");
}
Ok(())
}