twitcher 0.1.8

Find template switch mutations in genomic data
use std::{collections::HashMap, mem};
use tokio::io::AsyncBufReadExt;

use tracing::{debug, error};

use crate::common::{contig::ContigName, coords::GenomeRegion};

#[derive(Clone, Debug, Default)]
pub struct Regions(HashMap<ContigName, Vec<GenomeRegion>>);

impl Regions {
    // Sorts by chr, as specified by argument closure and then concats all
    pub fn into_linear<F: for<'a> Fn(&'a ContigName) -> usize>(
        mut self,
        cmp: F,
    ) -> Vec<GenomeRegion> {
        let mut keys: Vec<_> = self.0.keys().cloned().collect();
        keys.sort_by_key(cmp);

        keys.into_iter()
            .flat_map(move |key| self.0.remove(&key).unwrap())
            .collect()
    }

    pub fn contains(&self, region: &GenomeRegion) -> bool {
        let Some(list) = self.0.get(region.contig()) else {
            return false;
        };

        // the regions are non-overlapping (by merging them), so we can search by start position

        let ix = match list
            .binary_search_by_key(&region.start().position_0(), |r| r.start().position_0())
        {
            Ok(ix) => Some(ix),
            Err(0) => None,
            Err(ix) => Some(ix - 1),
        };

        ix.is_some_and(|ix| list[ix].contains(region))
    }
}

pub(crate) trait RegionsDefinition {
    async fn read_regions(&self) -> anyhow::Result<Option<Regions>>;
}

#[derive(clap::Args, Clone, Debug, Default)]
pub struct CliRegionsArgs {
    /// A list of regions to process. Each region shall be given in the format `chr1:100-200` or `chr1`. The reader will use an index to jump to the sepcified regions in the VCF file.
    #[arg(short = 'r', long = "regions", value_name = "LIST")]
    pub r_list: Vec<String>,

    /// Same as -r, but the list of regions is specified in a file. Each line corresponds to a region; each region may be either be a BED-like tab-separated definition, or a "position" in the form of `chr1:100-200`
    #[arg(short = 'R', long = "regions-file", value_name = "FILE")]
    pub r_file: Option<String>,
}

impl RegionsDefinition for CliRegionsArgs {
    async fn read_regions(&self) -> anyhow::Result<Option<Regions>> {
        read_regions(&self.r_list, self.r_file.as_ref()).await
    }
}

#[derive(clap::Args, Clone, Debug, Default)]
#[group(multiple = false)]
pub struct CliTargetsArgs {
    /// A list of targets. Format is the same as for -r, but the reader will stream instead of jumping to the sepcified regions. This may be more performant if the list of targets is long.
    #[arg(short = 't', long = "targets", value_name = "LIST")]
    pub t_list: Vec<String>,

    /// Same as -t, but the list of targets is specified in a file. See also -R.
    #[arg(short = 'T', long = "targets-file", value_name = "FILE")]
    pub t_file: Option<String>,
}

impl RegionsDefinition for CliTargetsArgs {
    async fn read_regions(&self) -> anyhow::Result<Option<Regions>> {
        read_regions(&self.t_list, self.t_file.as_ref()).await
    }
}

async fn read_regions(
    list: &[String],
    file: Option<&String>,
) -> Result<Option<Regions>, anyhow::Error> {
    let mut result: Option<Regions> = None;

    for reg in list {
        add_region(result.get_or_insert_default(), reg);
    }

    if let Some(path) = file {
        let fr = tokio::fs::File::open(path).await?;
        let mut br = tokio::io::BufReader::new(fr).lines();
        while let Some(reg) = br.next_line().await? {
            if reg.is_empty() {
                continue;
            }
            add_region(result.get_or_insert_default(), &reg);
        }
    }

    if let Some(ref mut regions) = result {
        for list in regions.0.values_mut() {
            list.sort_by(|a, b| a.partial_cmp(b).unwrap()); // they should all, by definition have the same contig, i.e. be comparable
            merge(list);
        }
    }

    debug!("List: {result:?}");

    Ok(result)
}

fn merge(list: &mut Vec<GenomeRegion>) {
    let mut reduced_list = Vec::with_capacity(list.len());

    let mut iter = list.iter().cloned();
    if let Some(mut current) = iter.next() {
        for region in iter {
            if let Some(merged) = current.merge(&region) {
                // They were mergeable; keep growing the current region
                current = merged;
            } else {
                // Not mergeable; finalize current and move on
                reduced_list.push(current);
                current = region;
            }
        }
        reduced_list.push(current);
    }

    reduced_list.shrink_to_fit();
    mem::swap(&mut reduced_list, list);
}

fn add_region(list: &mut Regions, region: &str) {
    match GenomeRegion::parse(region.as_bytes()) {
        Ok(reg) => list.0.entry(reg.contig().clone()).or_default().push(reg),
        Err(err) => {
            error!("Cannot parse region '{region}': {err}");
        }
    }
}