intspan 0.8.7

Command line tools for IntSpan related bioinformatics operations
Documentation
use clap::*;
use intspan::*;
use petgraph::prelude::NodeIndex;
use petgraph::*;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::BufRead;

// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
    Command::new("clean")
        .about("Replace ranges within links, incorporate hit strands and remove nested links")
        .after_help(
            r###"
* <infiles> are bilateral links files, with or without hit strands

"###,
        )
        .arg(
            Arg::new("infiles")
                .required(true)
                .num_args(1..)
                .index(1)
                .help("Set the input files to use"),
        )
        .arg(
            Arg::new("replace")
                .long("replace")
                .short('r')
                .num_args(1)
                .help("Two-column tsv file, normally produced by command merge"),
        )
        .arg(
            Arg::new("bundle")
                .long("bundle")
                .short('b')
                .num_args(1)
                .value_parser(value_parser!(i32))
                .default_value("0")
                .help("Bundle overlapped links. This value is the overlapping size. Suggested value is [500]"),
        )
        .arg(
            Arg::new("verbose")
                .long("verbose")
                .short('v')
                .action(ArgAction::SetTrue)
                .help("Verbose mode"),
        )
        .arg(
            Arg::new("outfile")
                .long("outfile")
                .short('o')
                .num_args(1)
                .default_value("stdout")
                .help("Output filename. [stdout] for screen"),
        )
}

// command implementation
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
    //----------------------------
    // Loading
    //----------------------------
    let bundle = *args.get_one::<i32>("bundle").unwrap();
    let is_verbose = args.get_flag("verbose");

    // cache ranges
    let mut range_of_part: HashMap<String, Range> = HashMap::new();

    //----------------------------
    // Load replaces
    //----------------------------
    let mut replaces: HashMap<String, String> = HashMap::new();
    if args.contains_id("replace") {
        if is_verbose {
            eprintln!("==> Load replaces");
        }
        for line in read_lines(args.get_one::<String>("replace").unwrap()) {
            build_range_of_part(&line, &mut range_of_part);

            let parts: Vec<&str> = line.split('\t').collect();
            if parts.len() == 2 {
                replaces.insert(parts[0].to_string(), parts[1].to_string());
            }
        }
    }

    //----------------------------
    // Replacing and incorporating strands
    //----------------------------
    if is_verbose {
        eprintln!("==> Replacing and incorporating strands");
    }

    let mut line_set: BTreeSet<String> = BTreeSet::new();
    for infile in args.get_many::<String>("infiles").unwrap() {
        let reader = reader(infile);
        for line in reader.lines().map_while(Result::ok) {
            build_range_of_part(&line, &mut range_of_part);

            let mut parts: Vec<String> = line.split('\t').map(String::from).collect();
            let count = parts.len();

            // make sure that all lines are bilateral links
            if !(count == 2 || count == 3) {
                continue;
            }
            if !range_of_part.contains_key(&parts[0]) {
                continue;
            }
            if !range_of_part.contains_key(&parts[1]) {
                continue;
            }

            // replacing
            for i in 0..count {
                let original = parts[i].to_string();
                if replaces.contains_key(&original) {
                    // create new range, use original strand
                    let mut new_range = range_of_part[&replaces[&original]].clone();
                    *new_range.strand_mut() = range_of_part[&original].strand().to_string();
                    let new_part = new_range.to_string();

                    build_range_of_part(&new_part, &mut range_of_part);
                    *parts.get_mut(i).unwrap() = new_part;
                }
            }

            // incorporating strands
            let mut strands: BTreeSet<String> = BTreeSet::new();
            if count == 3 && (parts[2] == "+" || parts[2] == "-") {
                strands.insert(parts[2].to_string());
            }

            for i in &[0, 1] {
                strands.insert(range_of_part[&parts[*i as usize]].strand().to_string());
            }
            //            eprintln!("strands = {:#?}", strands);

            let mut range_0 = range_of_part[&parts[0]].clone();
            let mut range_1 = range_of_part[&parts[1]].clone();

            // skip identical ranges
            if range_0.chr() == range_1.chr()
                && range_0.start() == range_1.start()
                && range_0.end() == range_1.end()
            {
                continue;
            }

            if strands.len() == 1 {
                *range_0.strand_mut() = "+".to_string();
                *range_1.strand_mut() = "+".to_string();
            } else {
                *range_0.strand_mut() = "+".to_string();
                *range_1.strand_mut() = "-".to_string();
            }

            let new_line = format!("{}\t{}", range_0, range_1);
            build_range_of_part(&new_line, &mut range_of_part);
            line_set.insert(new_line);
        }
    }

    //----------------------------
    // Remove nested links
    //----------------------------
    // now all lines (links) are without hit strands
    let mut lines = line_set
        .iter()
        .map(String::to_string)
        .collect::<Vec<String>>();
    lines = sort_links(&lines);
    let mut is_nested = true;
    while is_nested {
        if is_verbose {
            eprintln!("==> Remove nested links");
        }

        let mut to_remove: HashSet<String> = HashSet::new();
        let chr_pairs = lines
            .iter()
            .map(|line| {
                let parts: Vec<&str> = line.split('\t').collect();
                format!(
                    "{}:{}",
                    range_of_part[parts[0]].chr(),
                    range_of_part[parts[1]].chr()
                )
            })
            .collect::<Vec<String>>();

        for i in 0..lines.len() {
            let cur_pair = &chr_pairs[i];
            let rest_idx: Vec<usize> = (i + 1..lines.len())
                .filter(|key| chr_pairs[*key] == *cur_pair)
                .collect();

            for j in rest_idx {
                let line_i = &lines[i];
                let parts_i: Vec<&str> = line_i.split('\t').collect();

                let line_j = &lines[j];
                let parts_j: Vec<&str> = line_j.split('\t').collect();

                let intspan_0_i = range_of_part[parts_i[0]].intspan();
                let intspan_1_i = range_of_part[parts_i[1]].intspan();

                let intspan_0_j = range_of_part[parts_j[0]].intspan();
                let intspan_1_j = range_of_part[parts_j[1]].intspan();

                if intspan_0_i.superset(&intspan_0_j) && intspan_1_i.superset(&intspan_1_j) {
                    to_remove.insert(line_j.to_string());
                } else if intspan_0_j.superset(&intspan_0_i) && intspan_1_j.superset(&intspan_1_i) {
                    to_remove.insert(line_i.to_string());
                }
            }
        }

        lines = lines
            .iter()
            .filter(|key| !to_remove.contains(*key))
            .map(String::to_string)
            .collect::<Vec<String>>();
        is_nested = !to_remove.is_empty();
    }
    lines = sort_links(&lines);

    //----------------------------
    // Bundle links
    //----------------------------
    if bundle != 0 {
        if is_verbose {
            eprintln!("==> Bundle overlapped links");
        }

        let chr_strand_pairs = lines
            .iter()
            .map(|line| {
                let parts: Vec<&str> = line.split('\t').collect();
                format!(
                    "{}:{}:{}:{}",
                    range_of_part[parts[0]].chr(),
                    range_of_part[parts[0]].strand(),
                    range_of_part[parts[1]].chr(),
                    range_of_part[parts[1]].strand(),
                )
            })
            .collect::<Vec<String>>();

        let mut graph: Graph<String, (), Undirected> = Graph::new_undirected();
        // cache node indices
        let mut idx_of_line: HashMap<String, NodeIndex> = HashMap::new();

        for i in 0..lines.len() {
            let cur_pair = &chr_strand_pairs[i];
            let rest_idx: Vec<usize> = (i + 1..lines.len())
                .filter(|key| chr_strand_pairs[*key] == *cur_pair)
                .collect();

            for j in rest_idx {
                let line_i = &lines[i];
                let parts_i: Vec<&str> = line_i.split('\t').collect();

                let line_j = &lines[j];
                let parts_j: Vec<&str> = line_j.split('\t').collect();

                if !idx_of_line.contains_key(line_i) {
                    let idx = graph.add_node(line_i.to_string());
                    idx_of_line.insert(line_i.to_string(), idx);
                }
                if !idx_of_line.contains_key(line_j) {
                    let idx = graph.add_node(line_j.to_string());
                    idx_of_line.insert(line_j.to_string(), idx);
                }

                let intspan_0_i = range_of_part[parts_i[0]].intspan();
                let intspan_1_i = range_of_part[parts_i[1]].intspan();

                let intspan_0_j = range_of_part[parts_j[0]].intspan();
                let intspan_1_j = range_of_part[parts_j[1]].intspan();

                if intspan_0_i.intersect(&intspan_0_j).cardinality() >= bundle
                    && intspan_1_i.intersect(&intspan_1_j).cardinality() >= bundle
                {
                    graph.add_edge(idx_of_line[line_i], idx_of_line[line_j], ());
                }
            }
        }

        // bundle connected lines
        let scc: Vec<Vec<NodeIndex>> = petgraph::algo::tarjan_scc(&graph);
        for connected_indices in &scc {
            if connected_indices.len() < 2 {
                continue;
            }

            if is_verbose {
                eprintln!("Merge {} lines", connected_indices.len());
            }

            // connected lines
            let mut line_list = connected_indices
                .iter()
                .map(|idx| graph.node_weight(*idx).unwrap().clone())
                .collect::<Vec<String>>();
            line_list.sort();
            if is_verbose {
                eprintln!("line_list = {:#?}", line_list);
            }

            let mut merged_ranges: Vec<String> = Vec::new();
            for i in 0..=1 {
                let mut chr = "".to_string();
                let mut strand = "".to_string();
                let mut intspan = IntSpan::new();

                for line in &line_list {
                    // remove lines to be merged
                    lines = lines
                        .iter()
                        .filter(|key| *key != line)
                        .map(String::to_string)
                        .collect::<Vec<String>>();

                    let parts: Vec<&str> = line.split('\t').collect();
                    let range = range_of_part.get(parts[i]).unwrap();
                    chr = range.chr().to_string();
                    strand = range.strand().to_string();
                    intspan.merge(&range.intspan());
                }

                let merged: String =
                    format!("{}({}):{}-{}", chr, strand, intspan.min(), intspan.max());
                merged_ranges.push(merged);
            }

            let new_line = merged_ranges.join("\t");
            build_range_of_part(&new_line, &mut range_of_part);
            if is_verbose {
                eprintln!("    To {}", new_line);
            }
            lines.push(new_line);
        }

        lines = sort_links(&lines);
    }

    //----------------------------
    // Links of nearly identical ranges escaped from merging
    //----------------------------
    if args.contains_id("replace") {
        if is_verbose {
            eprintln!("==> Remove self links");
        }

        let same_chr_lines = lines
            .iter()
            .filter(|line| {
                let parts: Vec<&str> = line.split('\t').collect();
                range_of_part[parts[0]].chr() == range_of_part[parts[1]].chr()
            })
            .map(String::to_string)
            .collect::<Vec<String>>();

        for line in &same_chr_lines {
            let parts: Vec<&str> = line.split('\t').collect();

            let intspan_0 = range_of_part[parts[0]].intspan();
            let intspan_1 = range_of_part[parts[1]].intspan();

            let intersect = intspan_0.intersect(&intspan_1);
            if !intersect.is_empty()
                && intersect.cardinality() as f32 / intspan_0.cardinality() as f32 > 0.5
                && intersect.cardinality() as f32 / intspan_1.cardinality() as f32 > 0.5
            {
                lines = lines
                    .iter()
                    .filter(|key| *key != line)
                    .map(String::to_string)
                    .collect::<Vec<String>>();
            }
        }
    }

    //----------------------------
    // Output
    //----------------------------
    write_lines(args.get_one::<String>("outfile").unwrap(), &lines)?;

    Ok(())
}