use clap::*;
use indexmap::IndexSet;
use intspan::*;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::BufRead;
pub fn make_subcommand<'a>() -> Command<'a> {
Command::new("covered")
.about("Covered regions from .ovlp.tsv files")
.arg(
Arg::new("infiles")
.help("Sets the input file to use")
.required(true)
.min_values(1)
.index(1),
)
.arg(
Arg::new("coverage")
.help("minimal coverage")
.long("coverage")
.short('c')
.takes_value(true)
.default_value("3")
.forbid_empty_values(true),
)
.arg(
Arg::new("len")
.help("minimal length of overlaps")
.long("len")
.short('l')
.takes_value(true)
.default_value("1000")
.forbid_empty_values(true),
)
.arg(
Arg::new("idt")
.help("minimal identities of overlaps")
.long("idt")
.short('i')
.takes_value(true)
.default_value("0.0")
.forbid_empty_values(true),
)
.arg(Arg::new("paf").long("paf").help("PAF as input format"))
.arg(
Arg::new("longest")
.long("longest")
.help("only keep the longest span"),
)
.arg(Arg::new("base").long("base").help("per base coverage"))
.arg(Arg::new("mean").long("mean").help("mean coverage"))
.arg(
Arg::new("outfile")
.short('o')
.long("outfile")
.takes_value(true)
.default_value("stdout")
.forbid_empty_values(true)
.help("Output filename. [stdout] for screen"),
)
}
pub fn execute(args: &ArgMatches) -> std::result::Result<(), Box<dyn std::error::Error>> {
let mut writer = writer(args.value_of("outfile").unwrap());
let coverage: i32 = args.value_of_t("coverage").unwrap_or_else(|e| {
eprintln!("Need a integer for --coverage\n{}", e);
std::process::exit(1)
});
let min_len: i32 = args.value_of_t("len").unwrap_or_else(|e| {
eprintln!("Need a integer for --len\n{}", e);
std::process::exit(1)
});
let min_idt: f32 = args.value_of_t("idt").unwrap_or_else(|e| {
eprintln!("Need a integer for --idt\n{}", e);
std::process::exit(1)
});
let is_paf = args.is_present("paf");
let is_longest = args.is_present("longest");
let is_base = args.is_present("base");
let is_mean = args.is_present("mean");
let mut res: HashMap<String, Coverage> = HashMap::new();
let mut index_of: IndexSet<String> = IndexSet::new();
let mut seen: HashSet<(usize, usize)> = HashSet::new();
for infile in args.values_of("infiles").unwrap() {
let reader = reader(infile);
for line in reader.lines().filter_map(|r| r.ok()) {
let ovlp = if is_paf {
Overlap::from_paf(&line)
} else {
Overlap::new(&line)
};
let f_id = ovlp.f_id();
let g_id = ovlp.g_id();
if f_id == g_id {
continue;
}
if *ovlp.len() < min_len {
continue;
}
if *ovlp.idt() < min_idt {
continue;
}
let (f_idx, _) = index_of.insert_full(f_id.clone());
let (g_idx, _) = index_of.insert_full(g_id.clone());
let tup = (f_idx.min(g_idx), f_idx.max(g_idx));
let not_seen = seen.insert(tup);
if !not_seen {
continue;
}
if !res.contains_key(f_id) {
let tiers = Coverage::new_len(coverage, *ovlp.f_len());
res.insert(f_id.clone(), tiers);
}
res.entry(f_id.to_string())
.and_modify(|e| e.bump(*ovlp.f_begin(), *ovlp.f_end()));
if !res.contains_key(g_id) {
let tiers = Coverage::new_len(coverage, *ovlp.g_len());
res.insert(g_id.clone(), tiers);
}
res.entry(g_id.to_string())
.and_modify(|e| e.bump(*ovlp.g_begin(), *ovlp.g_end()));
}
}
let mut keys = res.keys().map(|k| k.to_string()).collect::<Vec<String>>();
keys.sort();
for key in &keys {
let mut _out_line = String::new();
if is_base {
let tiers = res.get(key).unwrap().uniq_tiers();
_out_line = base_lines(key, &tiers);
} else if is_mean {
let tiers = res.get(key).unwrap().uniq_tiers();
_out_line = mean_line(key, &tiers);
} else {
let intspan = res.get(key).unwrap().max_tier();
if !is_longest || intspan.span_size() <= 1 {
_out_line = format!("{}:{}", key, intspan);
} else {
_out_line = longest_line(key, &intspan);
}
}
if !_out_line.is_empty() {
writer.write_all((_out_line + "\n").as_ref())?;
}
}
Ok(())
}
fn base_lines(key: &str, tiers: &BTreeMap<i32, IntSpan>) -> String {
let mut basecovs: HashMap<i32, i32> = HashMap::new();
let max_tier = tiers.keys().max().unwrap();
for i in 0..=*max_tier {
for pos in tiers[&i].elements() {
basecovs.insert(pos, i);
}
}
let mut sorted: Vec<i32> = basecovs.keys().copied().collect();
sorted.sort_unstable();
let mut out_lines: Vec<String> = vec![];
for pos in sorted {
let line = format!("{}\t{}\t{}", key, pos - 1, basecovs[&pos]);
out_lines.push(line);
}
out_lines.join("\n")
}
fn mean_line(key: &str, tiers: &BTreeMap<i32, IntSpan>) -> String {
let total_len = tiers[&-1].cardinality();
let max_tier = tiers.keys().max().unwrap();
let mut sum = 0;
for i in 0..=*max_tier {
sum += i * tiers[&i].cardinality();
}
let mean_cov = sum as f32 / total_len as f32;
format!("{}\t{}\t{:.1}", key, total_len, mean_cov)
}
fn longest_line(key: &str, intspan: &IntSpan) -> String {
let ranges = intspan.ranges();
let mut sizes: Vec<i32> = Vec::new();
for i in 0..intspan.span_size() {
let size = ranges[i * 2 + 1] - ranges[i * 2] + 1;
sizes.push(size);
}
let mut max_i = 0;
for i in 0..intspan.span_size() {
let size = sizes[i];
if size > sizes[max_i] {
max_i = i;
}
}
let mut longest = IntSpan::new();
longest.add_pair(ranges[max_i * 2], ranges[max_i * 2 + 1]);
format!("{}:{}", key, longest)
}