use clap::*;
use intspan::*;
use petgraph::prelude::NodeIndex;
use petgraph::*;
use std::collections::{HashMap, HashSet};
use std::io::BufRead;
pub fn make_subcommand() -> Command {
Command::new("merge")
.about("Merge overlapped ranges via overlapping graph")
.after_help(
r###"
This command merges overlapping ranges from input files based on a specified coverage threshold.
It builds an overlapping graph for each chromosome and merges ranges that meet the coverage criteria.
Examples:
# Merge all ranges in the .tsv file with a coverage threshold of 0.98
rgr merge tests/rgr/II.links.tsv --coverage 0.98
# Enable verbose mode to see detailed processing information
rgr merge input1.rg input2.rg --coverage 0.95 --verbose
"###,
)
.arg(
Arg::new("infiles")
.required(true)
.num_args(1..)
.index(1)
.help("Input files to process. Multiple files can be specified"),
)
.arg(
Arg::new("coverage")
.long("coverage")
.short('c')
.num_args(1)
.default_value("0.95")
.value_parser(value_parser!(f32))
.help("Ranges with coverage larger than this value will be merged"),
)
.arg(
Arg::new("verbose")
.long("verbose")
.short('v')
.action(ArgAction::SetTrue)
.help("Enable verbose mode"),
)
.arg(
Arg::new("outfile")
.long("outfile")
.short('o')
.num_args(1)
.default_value("stdout")
.help("Output filename. [stdout] for screen"),
)
}
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let opt_coverage = *args.get_one::<f32>("coverage").unwrap();
let is_verbose = args.get_flag("verbose");
let mut graph_of_chr: HashMap<String, Graph<String, (), Undirected>> = HashMap::new();
let mut range_of_part: HashMap<String, Range> = HashMap::new();
let mut idx_of_part: HashMap<String, NodeIndex> = HashMap::new();
let mut chrs: HashSet<String> = HashSet::new();
for infile in args.get_many::<String>("infiles").unwrap() {
let reader = reader(infile);
for line in reader.lines().map_while(Result::ok) {
for part in line.split('\t') {
let range = Range::from_str(part);
if !range.is_valid() {
continue;
}
if range_of_part.contains_key(part) {
continue;
}
let chr = range.chr();
graph_of_chr
.entry(chr.to_string())
.or_insert_with(Graph::new_undirected);
chrs.insert(chr.to_string());
let idx = graph_of_chr
.get_mut(chr)
.unwrap()
.add_node(part.to_string());
idx_of_part.insert(part.to_string(), idx);
range_of_part.insert(part.to_string(), range);
}
} } let mut chrs = chrs.into_iter().collect::<Vec<String>>();
chrs.sort();
for chr in &chrs {
if is_verbose {
eprintln!("Chromosome {}", chr);
}
let graph = graph_of_chr.get_mut(chr).unwrap();
let indices = graph.node_indices().collect::<Vec<NodeIndex>>();
for i in 0..indices.len() {
let node_i = graph.node_weight(indices[i]).unwrap();
let intspan_i = range_of_part[node_i].intspan();
if is_verbose {
eprintln!(" Range {}/{}\t{}", i, indices.len(), node_i);
}
for j in i + 1..indices.len() {
let node_j = graph.node_weight(indices[j]).unwrap();
let intspan_j = range_of_part[node_j].intspan();
let intersect = intspan_i.intersect(&intspan_j);
if !intersect.is_empty() {
let coverage_i =
intersect.cardinality() as f32 / intspan_i.cardinality() as f32;
let coverage_j =
intersect.cardinality() as f32 / intspan_j.cardinality() as f32;
if coverage_i >= opt_coverage && coverage_j >= opt_coverage {
if is_verbose {
eprintln!(
" Merge with Range {}/{}\t{}",
j,
indices.len(),
node_j
);
}
graph.add_edge(indices[i], indices[j], ());
}
}
}
}
}
let mut out_lines: Vec<String> = Vec::new();
for chr in &chrs {
let graph = graph_of_chr.get(chr).unwrap();
let scc: Vec<Vec<NodeIndex>> = petgraph::algo::tarjan_scc(graph);
for cc_indices in &scc {
if cc_indices.len() < 2 {
continue;
}
if is_verbose {
eprintln!("Chromosome {}: Merge {} ranges", chr, cc_indices.len());
}
let mut part_list = cc_indices
.iter()
.map(|idx| graph.node_weight(*idx).unwrap().clone())
.collect::<Vec<String>>();
part_list.sort();
let mut intspan = IntSpan::new();
for part in &part_list {
let range = range_of_part.get(part).unwrap();
intspan.merge(&range.intspan());
}
let merged: String = format!("{}(+):{}", chr, intspan);
for part in &part_list {
if *part == merged {
continue;
}
let out_line = format!("{}\t{}", part, merged);
if is_verbose {
eprintln!("{}", out_line);
}
out_lines.push(out_line);
}
}
}
write_lines(args.get_one::<String>("outfile").unwrap(), &out_lines)?;
Ok(())
}