use clap::*;
use intspan::*;
use petgraph::prelude::NodeIndex;
use petgraph::*;
use std::cmp;
use std::collections::{BTreeSet, HashMap};
use std::io::BufRead;
pub fn make_subcommand() -> Command {
Command::new("connect")
.about("Connect bilateral links into multilateral ones")
.after_help(
r###"
* <infiles> are bilateral link files without hit strands
"###,
)
.arg(
Arg::new("infiles")
.required(true)
.num_args(1..)
.index(1)
.help("Set the input files to use"),
)
.arg(
Arg::new("ratio")
.long("ratio")
.short('r')
.num_args(1)
.default_value("0.9")
.value_parser(value_parser!(f32))
.help("Break links if length identities less than this ratio"),
)
.arg(
Arg::new("verbose")
.long("verbose")
.short('v')
.action(ArgAction::SetTrue)
.help("Verbose mode"),
)
.arg(
Arg::new("outfile")
.long("outfile")
.short('o')
.num_args(1)
.default_value("stdout")
.help("Output filename. [stdout] for screen"),
)
}
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let ratio = *args.get_one::<f32>("ratio").unwrap();
let is_verbose = args.get_flag("verbose");
let mut graph: Graph<String, String, Undirected, u32> = Graph::new_undirected();
let mut range_of_part: HashMap<String, Range> = HashMap::new();
let mut idx_of_part: HashMap<String, NodeIndex> = HashMap::new();
for infile in args.get_many::<String>("infiles").unwrap() {
if is_verbose {
eprintln!("==> Loading {:#?}", infile);
}
let reader = reader(infile);
for line in reader.lines().map_while(Result::ok) {
build_range_of_part(&line, &mut range_of_part);
let mut parts: Vec<String> = line
.split('\t')
.map(String::from)
.filter(|part| range_of_part.contains_key(part))
.collect();
let count = parts.len();
if count != 2 {
continue;
}
let mut strands: BTreeSet<String> = BTreeSet::new();
for i in 0..=1 {
let mut range = range_of_part[&parts[i]].clone();
strands.insert(range.strand().to_string());
*range.strand_mut() = "+".to_string();
parts[i] = range.to_string();
build_range_of_part(&parts[i], &mut range_of_part);
}
let hit_strand = if strands.len() == 1 {
"+".to_string()
} else {
"-".to_string()
};
for part in &parts {
if !idx_of_part.contains_key(part) {
let idx = graph.add_node(part.to_string());
idx_of_part.insert(part.to_string(), idx);
if is_verbose {
eprintln!("Add range {} as {:#?}", part, idx);
}
}
}
if graph
.find_edge(idx_of_part[&parts[0]], idx_of_part[&parts[1]])
.is_some()
{
if is_verbose {
eprintln!("Edge exists, next");
}
} else {
let edge =
graph.add_edge(idx_of_part[&parts[0]], idx_of_part[&parts[1]], hit_strand);
if is_verbose {
eprintln!(" Add edge {}\t{} as {:#?}", parts[0], parts[1], edge);
}
}
} }
if is_verbose {
eprintln!("==> Find connected components");
}
let mut cc_lines: Vec<String> = Vec::new();
let scc: Vec<Vec<NodeIndex>> = petgraph::algo::tarjan_scc(&graph);
for cc_indices in &scc {
if cc_indices.len() < 2 {
continue;
}
let part_list = cc_indices
.iter()
.map(|idx| graph.node_weight(*idx).unwrap().clone())
.collect::<Vec<String>>();
cc_lines.push(part_list.join("\t"));
}
cc_lines = sort_links(&cc_lines);
if is_verbose {
eprintln!("==> Total {:#?} connected components", cc_lines.len());
}
if is_verbose {
eprintln!("==> Change strands");
}
let mut new_graph: Graph<String, (), Undirected, u32> = Graph::new_undirected();
let mut new_idx_of_part: HashMap<String, NodeIndex> = HashMap::new();
for line in &cc_lines {
let parts: Vec<String> = line.split('\t').map(String::from).collect();
let count = parts.len();
if is_verbose {
eprintln!("Copy number of this cc is {}", count);
}
let mut ranges = parts
.iter()
.map(|part| {
let mut range = Range::from_str(part);
*range.strand_mut() = "".to_string();
range
})
.collect::<Vec<Range>>();
*ranges[0].strand_mut() = "+".to_string();
let mut assigned: BTreeSet<usize> = BTreeSet::new();
assigned.insert(0);
let mut unhandled: BTreeSet<usize> = (1..count).collect();
let mut edges = vec![];
while assigned.len() < count {
for i in assigned.iter().cloned().collect::<Vec<usize>>() {
for j in unhandled.iter().cloned().collect::<Vec<usize>>() {
let edge = graph.find_edge(idx_of_part[&parts[i]], idx_of_part[&parts[j]]);
if edge.is_none() {
continue;
}
let hit_strand = graph.edge_weight(edge.unwrap()).unwrap();
if hit_strand == "-" {
if is_verbose {
eprint!(
" Based on {}, change strand from {}",
ranges[i], ranges[j]
);
}
if ranges[i].strand() == "-" {
*ranges[j].strand_mut() = "+".to_string();
} else {
*ranges[j].strand_mut() = "-".to_string();
}
if is_verbose {
eprintln!(" to {}", ranges[j]);
}
} else {
*ranges[j].strand_mut() = ranges[i].strand().to_string();
}
let j_copy = unhandled.take(&j).unwrap();
assigned.insert(j_copy);
let size_i = ranges[i].intspan().cardinality();
let size_j = ranges[j].intspan().cardinality();
let size_min = cmp::min(size_i, size_j);
let size_max = cmp::max(size_i, size_j);
let diff_ratio = size_min as f32 / size_max as f32;
if diff_ratio < ratio {
if is_verbose {
eprintln!(" Break edge between {}\t{}", parts[i], parts[j]);
eprintln!(
" Ratio[{}]\tMin [{}]\tMax[{}]",
diff_ratio, size_min, size_max
);
}
} else {
edges.push((i, j));
}
}
}
}
for (i, j) in edges {
let part_i = ranges[i].to_string();
let part_j = ranges[j].to_string();
for part in &[part_i.to_string(), part_j.to_string()] {
if !new_idx_of_part.contains_key(part) {
let idx = new_graph.add_node(part.to_string());
new_idx_of_part.insert(part.to_string(), idx);
}
}
new_graph.add_edge(new_idx_of_part[&part_i], new_idx_of_part[&part_j], ());
}
}
if is_verbose {
eprintln!("==> Recreate connected components");
}
let mut out_lines: Vec<String> = Vec::new();
let scc: Vec<Vec<NodeIndex>> = petgraph::algo::tarjan_scc(&new_graph);
for cc_indices in &scc {
if cc_indices.len() < 2 {
continue;
}
let part_list = cc_indices
.iter()
.map(|idx| new_graph.node_weight(*idx).unwrap().clone())
.collect::<Vec<String>>();
out_lines.push(part_list.join("\t"));
}
out_lines = sort_links(&out_lines);
write_lines(args.get_one::<String>("outfile").unwrap(), &out_lines)?;
Ok(())
}