klassify 0.1.6

Classify chimeric reads based on unique kmer contents
Documentation
use crate::tools::depth::{get_runner, BedRecord};
use crate::utils::{index_bam, need_update, prefix_until_dot, BINSIZE, CHAIN_DISTANCE};
use clap::Parser;
use log::info;
use perbase_lib::utils::{get_reader, get_writer};
use std::collections::{BTreeMap, HashMap};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;

/// Minimum read support for a candidate region
const MIN_READ_SUPPORT: u32 = 5;

/// Maximum read support for a candidate region
const MAX_READ_SUPPORT: u32 = 100;

#[derive(Parser, Debug)]
#[command(arg_required_else_help(true))]
pub struct RegionsArgs {
    /// BAM files
    pub bam_files: Vec<String>,
    /// Do not limit chimeras between chromosomes only, e.g., must contain "Chr" and "chr"
    #[clap(short, long, default_value_t = false)]
    pub no_chr_only: bool,
    /// Minimum depth support for a region
    #[clap(long, default_value_t = MIN_READ_SUPPORT)]
    pub min_support: u32,
    /// Maximum depth support for a region
    #[clap(long, default_value_t = MAX_READ_SUPPORT)]
    pub max_support: u32,
}

/// Prepare BAM files and generate depths for each bin
pub fn regions(bam_files: &Vec<String>, chr_only: bool, min_support: u32, max_support: u32) {
    let mut bed_files = Vec::new();
    for bam_file in bam_files {
        let bed_file = if bam_file.ends_with(".bam") {
            regions_one(bam_file, BINSIZE)
        } else {
            bam_file.clone()
        };
        bed_files.push(bed_file);
    }

    // Perform the depth analysis
    process_bedfiles(bed_files, chr_only, min_support, max_support);
}

/// Prepare one BAM file and generate depths for each bin
fn regions_one(bam_file: &str, bin_size: u32) -> String {
    // Check if BAM index exists
    index_bam(bam_file);

    // Output file compatible with the .regions.bed.gz
    let out_prefix = prefix_until_dot(bam_file);
    let bed_path = format!("{out_prefix}.regions.bed.gz");
    if need_update(&[bam_file], &[&bed_path], true) {
        let runner = get_runner(bam_file, bin_size);
        let rx = runner.process().unwrap(); // multithreaded iteration
        let mut writer = get_writer(&Some(&bed_path), true, false, 0, 6).unwrap();

        rx.into_iter()
            .try_for_each(|d| -> Result<(), csv::Error> {
                // BED is 0-based, half-open
                writer
                    .write_record([
                        d.chrom.as_str(),
                        d.start.to_string().as_str(),
                        d.end.to_string().as_str(),
                        d.depth.to_string().as_str(),
                    ])
                    .expect("Failed to write BED record");
                Ok(())
            })
            .expect("Failed to write BED record");
    }
    info!("Generated depth `{bed_path}` for `{bam_file}`");
    bed_path
}

/// Load a BED file into a bunch of records
fn load_bed(bed: &str) -> Vec<BedRecord> {
    let mut reader = get_reader(&Some(bed), false, true).expect("Failed to open BED file");
    reader.deserialize().map(Result::unwrap).collect()
}

/// Process F1 and parent BED files to generate candidate regions.
fn process_bedfiles(
    bed_files: Vec<String>,
    chr_only: bool,
    min_support: u32,
    max_support: u32,
) -> HashMap<String, i32> {
    let child_bed = &bed_files[0];
    let parent1_bed = &bed_files[1];
    let parent2_bed = if bed_files.len() == 3 {
        Some(bed_files[2].clone())
    } else {
        None
    };

    let child_records = load_bed(child_bed);
    let parent1_records = load_bed(parent1_bed);
    let parent2_records = parent2_bed.map(|bed| load_bed(&bed));

    let mut regions: BTreeMap<String, Vec<(u32, u32, f64)>> = BTreeMap::new();

    for (i, child_record) in child_records.iter().enumerate() {
        let child_depth = child_record.depth;
        let parent1_depth = parent1_records[i].depth;
        let parent2_depth = if let Some(ref parent2_recs) = parent2_records {
            parent2_recs[i].depth
        } else {
            0.0
        };

        let depth_ratio = child_depth / (parent1_depth + parent2_depth + 1.0);

        regions
            .entry(child_record.chrom.clone())
            .or_default()
            .push((child_record.start, child_record.end, depth_ratio));
    }

    let mut d = Vec::new();
    let mut selected = Vec::new();
    let support_range = min_support as f64..=max_support as f64;

    for (chrom, data) in &mut regions {
        if chr_only && !chrom.contains("Chr") && !chrom.contains("chr") {
            continue;
        }

        data.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());

        let chrom_selected: Vec<_> = data
            .iter()
            .filter(|&&(_, _, depth)| support_range.contains(&depth))
            .map(|&(start, end, depth)| (chrom.clone(), start, end, format!("{}", depth.round())))
            .collect();

        selected.extend_from_slice(&chrom_selected);
        let regions_str = chrom_selected
            .iter()
            .map(|(chrom, start, end, depth)| {
                format!("{}:{}-{}:{}", chrom, start, end, depth.clone())
            })
            .collect::<Vec<_>>()
            .join(",");

        d.push((chrom.clone(), regions_str));
    }

    let prefix = Path::new(child_bed).file_stem().unwrap().to_str().unwrap();
    let pf = prefix_until_dot(prefix);
    let poi_tsv = format!("{pf}.poi.tsv");

    let mut kf_writer = csv::WriterBuilder::new()
        .delimiter(b'\t')
        .from_path(&poi_tsv)
        .unwrap();

    kf_writer.write_record(["Chrom", "Regions"]).unwrap();

    for (chrom, regions_str) in d {
        kf_writer.write_record(&[chrom, regions_str]).unwrap();
    }

    info!("Points of interests written to `{}`", poi_tsv);

    // Merge regions that are close to each other
    selected.sort_by_key(|k| (k.0.clone(), k.1));

    let mut merged = Vec::new();

    for (i, s) in selected.iter().enumerate() {
        if i == 0 {
            merged.push(s.clone());
            continue;
        }

        let prev = merged.last_mut().unwrap();
        let cur = s;

        if prev.0 == cur.0 && prev.2 + CHAIN_DISTANCE >= cur.1 {
            prev.2 = prev.2.max(cur.2);
            prev.3 = format!("{},{}", prev.3, cur.3);
        } else {
            merged.push(cur.clone());
        }
    }

    // Write the merged regions to a file
    let regions_file = format!("{pf}.regions.tsv");
    let mut counter = HashMap::new();

    let mut regions_writer = BufWriter::new(File::create(&regions_file).unwrap());

    for (chrom, start, end, score) in &merged {
        writeln!(regions_writer, "{chrom}:{start}-{end}\t{score}").unwrap();
        *counter.entry(chrom[..2].to_string()).or_insert(0) += 1;
    }

    info!("Merged regions written to `{}`", regions_file);
    info!("Region counts: {:?}", counter);
    counter
}