use clap::*;
use intspan::*;
use petgraph::prelude::NodeIndex;
use petgraph::*;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::BufRead;
pub fn make_subcommand() -> Command {
Command::new("clean")
.about("Replace ranges within links, incorporate hit strands and remove nested links")
.after_help(
r###"
* <infiles> are bilateral links files, with or without hit strands
"###,
)
.arg(
Arg::new("infiles")
.required(true)
.num_args(1..)
.index(1)
.help("Set the input files to use"),
)
.arg(
Arg::new("replace")
.long("replace")
.short('r')
.num_args(1)
.help("Two-column tsv file, normally produced by command merge"),
)
.arg(
Arg::new("bundle")
.long("bundle")
.short('b')
.num_args(1)
.value_parser(value_parser!(i32))
.default_value("0")
.help("Bundle overlapped links. This value is the overlapping size. Suggested value is [500]"),
)
.arg(
Arg::new("verbose")
.long("verbose")
.short('v')
.action(ArgAction::SetTrue)
.help("Verbose mode"),
)
.arg(
Arg::new("outfile")
.long("outfile")
.short('o')
.num_args(1)
.default_value("stdout")
.help("Output filename. [stdout] for screen"),
)
}
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let bundle = *args.get_one::<i32>("bundle").unwrap();
let is_verbose = args.get_flag("verbose");
let mut range_of_part: HashMap<String, Range> = HashMap::new();
let mut replaces: HashMap<String, String> = HashMap::new();
if args.contains_id("replace") {
if is_verbose {
eprintln!("==> Load replaces");
}
for line in read_lines(args.get_one::<String>("replace").unwrap()) {
build_range_of_part(&line, &mut range_of_part);
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() == 2 {
replaces.insert(parts[0].to_string(), parts[1].to_string());
}
}
}
if is_verbose {
eprintln!("==> Replacing and incorporating strands");
}
let mut line_set: BTreeSet<String> = BTreeSet::new();
for infile in args.get_many::<String>("infiles").unwrap() {
let reader = reader(infile);
for line in reader.lines().map_while(Result::ok) {
build_range_of_part(&line, &mut range_of_part);
let mut parts: Vec<String> = line.split('\t').map(String::from).collect();
let count = parts.len();
if !(count == 2 || count == 3) {
continue;
}
if !range_of_part.contains_key(&parts[0]) {
continue;
}
if !range_of_part.contains_key(&parts[1]) {
continue;
}
for i in 0..count {
let original = parts[i].to_string();
if replaces.contains_key(&original) {
let mut new_range = range_of_part[&replaces[&original]].clone();
*new_range.strand_mut() = range_of_part[&original].strand().to_string();
let new_part = new_range.to_string();
build_range_of_part(&new_part, &mut range_of_part);
*parts.get_mut(i).unwrap() = new_part;
}
}
let mut strands: BTreeSet<String> = BTreeSet::new();
if count == 3 && (parts[2] == "+" || parts[2] == "-") {
strands.insert(parts[2].to_string());
}
for i in &[0, 1] {
strands.insert(range_of_part[&parts[*i as usize]].strand().to_string());
}
let mut range_0 = range_of_part[&parts[0]].clone();
let mut range_1 = range_of_part[&parts[1]].clone();
if range_0.chr() == range_1.chr()
&& range_0.start() == range_1.start()
&& range_0.end() == range_1.end()
{
continue;
}
if strands.len() == 1 {
*range_0.strand_mut() = "+".to_string();
*range_1.strand_mut() = "+".to_string();
} else {
*range_0.strand_mut() = "+".to_string();
*range_1.strand_mut() = "-".to_string();
}
let new_line = format!("{}\t{}", range_0, range_1);
build_range_of_part(&new_line, &mut range_of_part);
line_set.insert(new_line);
}
}
let mut lines = line_set
.iter()
.map(String::to_string)
.collect::<Vec<String>>();
lines = sort_links(&lines);
let mut is_nested = true;
while is_nested {
if is_verbose {
eprintln!("==> Remove nested links");
}
let mut to_remove: HashSet<String> = HashSet::new();
let chr_pairs = lines
.iter()
.map(|line| {
let parts: Vec<&str> = line.split('\t').collect();
format!(
"{}:{}",
range_of_part[parts[0]].chr(),
range_of_part[parts[1]].chr()
)
})
.collect::<Vec<String>>();
for i in 0..lines.len() {
let cur_pair = &chr_pairs[i];
let rest_idx: Vec<usize> = (i + 1..lines.len())
.filter(|key| chr_pairs[*key] == *cur_pair)
.collect();
for j in rest_idx {
let line_i = &lines[i];
let parts_i: Vec<&str> = line_i.split('\t').collect();
let line_j = &lines[j];
let parts_j: Vec<&str> = line_j.split('\t').collect();
let intspan_0_i = range_of_part[parts_i[0]].intspan();
let intspan_1_i = range_of_part[parts_i[1]].intspan();
let intspan_0_j = range_of_part[parts_j[0]].intspan();
let intspan_1_j = range_of_part[parts_j[1]].intspan();
if intspan_0_i.superset(&intspan_0_j) && intspan_1_i.superset(&intspan_1_j) {
to_remove.insert(line_j.to_string());
} else if intspan_0_j.superset(&intspan_0_i) && intspan_1_j.superset(&intspan_1_i) {
to_remove.insert(line_i.to_string());
}
}
}
lines = lines
.iter()
.filter(|key| !to_remove.contains(*key))
.map(String::to_string)
.collect::<Vec<String>>();
is_nested = !to_remove.is_empty();
}
lines = sort_links(&lines);
if bundle != 0 {
if is_verbose {
eprintln!("==> Bundle overlapped links");
}
let chr_strand_pairs = lines
.iter()
.map(|line| {
let parts: Vec<&str> = line.split('\t').collect();
format!(
"{}:{}:{}:{}",
range_of_part[parts[0]].chr(),
range_of_part[parts[0]].strand(),
range_of_part[parts[1]].chr(),
range_of_part[parts[1]].strand(),
)
})
.collect::<Vec<String>>();
let mut graph: Graph<String, (), Undirected> = Graph::new_undirected();
let mut idx_of_line: HashMap<String, NodeIndex> = HashMap::new();
for i in 0..lines.len() {
let cur_pair = &chr_strand_pairs[i];
let rest_idx: Vec<usize> = (i + 1..lines.len())
.filter(|key| chr_strand_pairs[*key] == *cur_pair)
.collect();
for j in rest_idx {
let line_i = &lines[i];
let parts_i: Vec<&str> = line_i.split('\t').collect();
let line_j = &lines[j];
let parts_j: Vec<&str> = line_j.split('\t').collect();
if !idx_of_line.contains_key(line_i) {
let idx = graph.add_node(line_i.to_string());
idx_of_line.insert(line_i.to_string(), idx);
}
if !idx_of_line.contains_key(line_j) {
let idx = graph.add_node(line_j.to_string());
idx_of_line.insert(line_j.to_string(), idx);
}
let intspan_0_i = range_of_part[parts_i[0]].intspan();
let intspan_1_i = range_of_part[parts_i[1]].intspan();
let intspan_0_j = range_of_part[parts_j[0]].intspan();
let intspan_1_j = range_of_part[parts_j[1]].intspan();
if intspan_0_i.intersect(&intspan_0_j).cardinality() >= bundle
&& intspan_1_i.intersect(&intspan_1_j).cardinality() >= bundle
{
graph.add_edge(idx_of_line[line_i], idx_of_line[line_j], ());
}
}
}
let scc: Vec<Vec<NodeIndex>> = petgraph::algo::tarjan_scc(&graph);
for connected_indices in &scc {
if connected_indices.len() < 2 {
continue;
}
if is_verbose {
eprintln!("Merge {} lines", connected_indices.len());
}
let mut line_list = connected_indices
.iter()
.map(|idx| graph.node_weight(*idx).unwrap().clone())
.collect::<Vec<String>>();
line_list.sort();
if is_verbose {
eprintln!("line_list = {:#?}", line_list);
}
let mut merged_ranges: Vec<String> = Vec::new();
for i in 0..=1 {
let mut chr = "".to_string();
let mut strand = "".to_string();
let mut intspan = IntSpan::new();
for line in &line_list {
lines = lines
.iter()
.filter(|key| *key != line)
.map(String::to_string)
.collect::<Vec<String>>();
let parts: Vec<&str> = line.split('\t').collect();
let range = range_of_part.get(parts[i]).unwrap();
chr = range.chr().to_string();
strand = range.strand().to_string();
intspan.merge(&range.intspan());
}
let merged: String =
format!("{}({}):{}-{}", chr, strand, intspan.min(), intspan.max());
merged_ranges.push(merged);
}
let new_line = merged_ranges.join("\t");
build_range_of_part(&new_line, &mut range_of_part);
if is_verbose {
eprintln!(" To {}", new_line);
}
lines.push(new_line);
}
lines = sort_links(&lines);
}
if args.contains_id("replace") {
if is_verbose {
eprintln!("==> Remove self links");
}
let same_chr_lines = lines
.iter()
.filter(|line| {
let parts: Vec<&str> = line.split('\t').collect();
range_of_part[parts[0]].chr() == range_of_part[parts[1]].chr()
})
.map(String::to_string)
.collect::<Vec<String>>();
for line in &same_chr_lines {
let parts: Vec<&str> = line.split('\t').collect();
let intspan_0 = range_of_part[parts[0]].intspan();
let intspan_1 = range_of_part[parts[1]].intspan();
let intersect = intspan_0.intersect(&intspan_1);
if !intersect.is_empty()
&& intersect.cardinality() as f32 / intspan_0.cardinality() as f32 > 0.5
&& intersect.cardinality() as f32 / intspan_1.cardinality() as f32 > 0.5
{
lines = lines
.iter()
.filter(|key| *key != line)
.map(String::to_string)
.collect::<Vec<String>>();
}
}
}
write_lines(args.get_one::<String>("outfile").unwrap(), &lines)?;
Ok(())
}