bbnorm-rs 0.1.3

use crate::cli::Config;
use crate::seqio::create_output;
use anyhow::{Context, Result};
use std::io::Write;
use std::path::Path;

const MAX_WIDTH_MULT: f64 = 2.5;
const PROGRESSIVE_MULT: f64 = 2.0;
const MAX_RADIUS: i32 = 10;

#[derive(Clone, Debug)]
struct Peak {
    start: usize,
    center: usize,
    stop: usize,
    max_pos: usize,
    max_height: u64,
    start_height: u64,
    stop_height: u64,
    left_min: u64,
    right_min: u64,
    volume: u64,
    volume2: u64,
}

impl Peak {
    #[allow(clippy::too_many_arguments)]
    fn new(
        center: usize,
        start: usize,
        stop: usize,
        max_pos: usize,
        max_height: u64,
        start_height: u64,
        stop_height: u64,
        left_min: u64,
        right_min: u64,
        volume: u64,
        volume2: u64,
    ) -> Self {
        Self {
            start,
            center,
            stop,
            max_pos,
            max_height,
            start_height,
            stop_height,
            left_min,
            right_min,
            volume,
            volume2,
        }
    }

    fn recalculate(&mut self, array: &[u64]) {
        self.max_height = array[self.center];
        self.start_height = array[self.start];
        self.stop_height = array[self.stop];
        self.left_min = self.start_height;
        self.right_min = self.stop_height;
        self.max_pos = self.center;
        self.volume = 0;
        self.volume2 = 0;

        for (i, &x) in array.iter().enumerate().take(self.stop).skip(self.start) {
            if x > self.max_height {
                self.max_pos = i;
                self.max_height = x;
            }
            if i < self.center {
                self.left_min = self.left_min.min(x);
            } else if i > self.center {
                self.right_min = self.right_min.min(x);
            }
            self.volume = self.volume.saturating_add(x);
            self.volume2 = self.volume2.saturating_add(x.saturating_mul(i as u64));
        }
    }

    fn compatible_with(&self, other: &Self) -> bool {
        let min = self.center.min(other.stop) as f64;
        let max = self.stop.max(other.center) as f64;
        min * MAX_WIDTH_MULT >= max
    }

    fn absorb(&mut self, other: &Self) {
        if self.center > other.center {
            if self.start > other.start {
                self.start = other.start;
                self.start_height = other.start_height;
            }
            self.left_min = self.left_min.min(other.left_min);
        } else {
            if self.stop < other.stop {
                self.stop = other.stop;
                self.stop_height = other.stop_height;
            }
            self.right_min = self.right_min.min(other.right_min);
        }

        if self.max_height < other.max_height {
            self.max_height = other.max_height;
            self.max_pos = other.max_pos;
        }
        self.volume = self.volume.saturating_add(other.volume);
        self.volume2 = self.volume2.saturating_add(other.volume2);
    }
}

pub(crate) fn write_peaks(path: &Path, raw_hist: &[u64], config: &Config) -> Result<()> {
    let unique_hist: Vec<u64> = raw_hist
        .iter()
        .copied()
        .enumerate()
        .map(|(depth, raw)| unique_from_raw(depth, raw))
        .collect();
    let peaks = call_peaks(&unique_hist, config);
    let unique_kmers: u64 = unique_hist.iter().sum();

    let mut writer = create_output(path, config.overwrite || config.append)
        .with_context(|| format!("creating peaks {}", path.display()))?;
    print_peaks(&mut writer, &peaks, config, unique_kmers, &unique_hist)?;
    writer.flush()?;
    Ok(())
}

fn call_peaks(original: &[u64], config: &Config) -> Vec<Peak> {
    let array = smooth_progressive(original, 1);
    let length = original.len();
    let Some(dip0) = (1..length).find_map(|i| (array[i - 1] < array[i]).then_some(i - 1)) else {
        return Vec::new();
    };

    let mut peaks = Vec::new();
    let mut mode_up = true;
    let mut start = dip0;
    let mut center = 0usize;
    let mut prev = array[dip0];
    let mut sum = prev;
    let mut sum2 = prev.saturating_mul(dip0 as u64);
    let mut i = dip0 + 1;

    while i < length {
        let x = array[i];
        if mode_up {
            if x < prev {
                mode_up = false;
                center = i - 1;
            }
        } else if x > prev {
            let mut stop = i - 1;
            let max = array[center];
            if peak_passes_filters(center, start, stop, max, sum, config) {
                center = middle_of_mesa(&array, center, max);
                stop = middle_of_valley(&array, stop);
                let height1 = array[start];
                let height2 = array[stop];
                peaks.push(Peak::new(
                    center, start, stop, center, max, height1, height2, height1, height2, sum, sum2,
                ));
            }
            start = stop;
            sum = 0;
            sum2 = 0;
            if i > config.peak_max_peak {
                break;
            }
            while i < array.len() && array[i] == 0 {
                i += 1;
            }
            mode_up = true;
        }

        sum = sum.saturating_add(x);
        sum2 = sum2.saturating_add(x.saturating_mul(i as u64));
        prev = x;
        i += 1;
    }

    if !mode_up {
        let mut stop = length;
        let max = array[center];
        center = middle_of_mesa(&array, center, max);
        let valley = array[stop - 1];
        for j in (0..stop).rev() {
            if array[j] != valley {
                stop = if valley == 0 {
                    j + 1
                } else {
                    (stop - 1 + j + 2) / 2
                };
                break;
            }
        }
        let clamped_stop = stop.min(length - 1);
        if peak_passes_filters(center, start, stop, max, sum, config) {
            let height1 = array[start];
            let height2 = array[clamped_stop];
            peaks.push(Peak::new(
                center,
                start,
                clamped_stop,
                center,
                max,
                height1,
                height2,
                height1,
                height2,
                sum,
                sum2,
            ));
        }
    }

    cap_width(&mut peaks, &array);
    if config.peak_max_count < peaks.len() {
        peaks = condense(&peaks, config.peak_max_count);
    }
    cap_width(&mut peaks, &array);

    if peaks.len() > 1 {
        let biggest_volume = peaks[biggest_peak(&peaks)].volume as f64;
        while peaks.len() > 1 && (peaks[0].volume as f64) < 0.0001 * biggest_volume {
            peaks.remove(0);
        }
    }

    for peak in &mut peaks {
        peak.recalculate(original);
    }
    peaks
        .into_iter()
        .filter(|peak| peak.volume >= config.peak_min_volume)
        .collect()
}

fn peak_passes_filters(
    center: usize,
    start: usize,
    stop: usize,
    max: u64,
    sum: u64,
    config: &Config,
) -> bool {
    center >= config.peak_min_peak
        && center <= config.peak_max_peak
        && max >= config.peak_min_height
        && stop.saturating_sub(start) >= config.peak_min_width
        && sum >= config.peak_min_volume
}

fn middle_of_mesa(array: &[u64], center: usize, max: u64) -> usize {
    for j in (0..center).rev() {
        if array[j] != max {
            return (center + j + 2) / 2;
        }
    }
    center
}

fn middle_of_valley(array: &[u64], stop: usize) -> usize {
    let valley = array[stop];
    for j in (0..=stop).rev() {
        if array[j] != valley {
            return if valley == 0 {
                j + 1
            } else {
                (stop + j + 2) / 2
            };
        }
    }
    stop
}

fn cap_width(peaks: &mut [Peak], counts: &[u64]) {
    let mult = 1.0 / MAX_WIDTH_MULT;
    for peak in peaks {
        peak.start = java_round_f64((peak.start as f64).max(peak.center as f64 * mult)) as usize;
        peak.stop =
            java_round_f64((peak.stop as f64).min(peak.center as f64 * MAX_WIDTH_MULT)) as usize;
        peak.stop = peak.stop.min(counts.len().saturating_sub(1));
        peak.start = peak.start.min(peak.center);
        peak.recalculate(counts);
    }
}

fn condense(input: &[Peak], max_count: usize) -> Vec<Peak> {
    if input.is_empty() {
        return Vec::new();
    }
    let max_count = max_count.min(input.len()).max(1);
    let mut heights: Vec<u64> = input.iter().map(|peak| peak.max_height).collect();
    heights.sort_unstable();
    let hlimit = heights[heights.len() - max_count];

    let mc2 = max_count.div_ceil(2);
    let mut volumes: Vec<u64> = input.iter().map(|peak| peak.volume).collect();
    volumes.sort_unstable();
    let vlimit = volumes[volumes.len() - mc2];

    let mut output = Vec::with_capacity(max_count.min(input.len()));
    for peak in input {
        if peak.volume >= vlimit || peak.max_height >= hlimit {
            output.push(peak.clone());
        }
    }

    for peak in input {
        if peak.volume < vlimit && peak.max_height < hlimit {
            let Some((closest_idx, _)) = output
                .iter()
                .enumerate()
                .min_by_key(|(_, kept)| peak.center.abs_diff(kept.center))
            else {
                continue;
            };
            if output[closest_idx].compatible_with(peak) {
                output[closest_idx].absorb(peak);
            }
        }
    }

    output
}

fn print_peaks(
    writer: &mut Box<dyn Write>,
    peaks: &[Peak],
    config: &Config,
    unique_kmers: u64,
    hist: &[u64],
) -> Result<()> {
    if !peaks.is_empty() {
        let min_volume_fraction = single_copy_kmer_fraction(0.0003, config.k, 2).min(1.0);
        let ploidy_estimate = calc_ploidy(peaks, min_volume_fraction);
        let ploidy = if config.peak_ploidy > 0 {
            config.peak_ploidy
        } else {
            ploidy_estimate
        };
        let haploid_peak_center = haploid_peak_center(peaks, ploidy);
        let error_kmers = error_kmers(peaks, hist, min_volume_fraction);
        let genome_size_in_peaks = genome_size_in_peaks(peaks, haploid_peak_center);
        let total_genome_size = genome_size2(peaks, haploid_peak_center, hist);
        let repeat_size = repeat_size(peaks, ploidy, haploid_peak_center);
        let repeat_size2 = repeat_size2(peaks, ploidy, haploid_peak_center, hist);
        let haploid_size = total_genome_size / i64::from(ploidy);
        let het_locs = calc_het_locations(peaks, ploidy, haploid_peak_center, config.k);
        let het_rate = (het_locs as f64 / haploid_size as f64) / 2.0;
        let repeat_rate = repeat_size as f64 / genome_size_in_peaks as f64;
        let repeat_rate2 = repeat_size2 as f64 / total_genome_size as f64;

        let mut main_peak = &peaks[0];
        let mut ploidy_peak = &peaks[0];
        let target = haploid_peak_center * f64::from(ploidy);
        for peak in peaks {
            if peak.volume > main_peak.volume {
                main_peak = peak;
            }
            if abs_diff(peak.center as f64, target) < abs_diff(ploidy_peak.center as f64, target) {
                ploidy_peak = peak;
            }
        }
        let haploid_cov = if target.max(ploidy_peak.center as f64)
            / target.min(ploidy_peak.center as f64)
            < 1.3
        {
            ploidy_peak.center as i64
        } else {
            target as i64
        };

        writeln!(writer, "#k\t{}", config.k)?;
        writeln!(writer, "#unique_kmers\t{unique_kmers}")?;
        writeln!(writer, "#error_kmers\t{error_kmers}")?;
        writeln!(
            writer,
            "#genomic_kmers\t{}",
            unique_kmers as i64 - error_kmers
        )?;
        writeln!(writer, "#main_peak\t{}", main_peak.center)?;
        writeln!(writer, "#genome_size_in_peaks\t{genome_size_in_peaks}")?;
        writeln!(writer, "#genome_size\t{total_genome_size}")?;
        writeln!(writer, "#haploid_genome_size\t{haploid_size}")?;
        writeln!(
            writer,
            "#fold_coverage\t{}",
            java_round_f64(haploid_peak_center)
        )?;
        writeln!(writer, "#haploid_fold_coverage\t{haploid_cov}")?;
        writeln!(writer, "#ploidy\t{ploidy}")?;
        if ploidy != ploidy_estimate {
            writeln!(writer, "#ploidy_detected\t{ploidy_estimate}")?;
        }
        if ploidy > 1 {
            writeln!(writer, "#het_rate\t{het_rate:.5}")?;
        }
        writeln!(
            writer,
            "#percent_repeat_in_peaks\t{:.3}",
            100.0 * repeat_rate
        )?;
        if repeat_size2 >= 0 {
            writeln!(writer, "#percent_repeat\t{:.3}", 100.0 * repeat_rate2)?;
        }
    }

    writeln!(writer, "#start\tcenter\tstop\tmax\tvolume")?;
    for peak in peaks {
        if peak.volume >= config.peak_min_volume {
            writeln!(
                writer,
                "{}\t{}\t{}\t{}\t{}",
                peak.start, peak.center, peak.stop, peak.max_height, peak.volume
            )?;
        }
    }
    Ok(())
}

fn single_copy_kmer_fraction(het_rate: f64, k: usize, ploidy: i32) -> f64 {
    if ploidy < 2 {
        return 1.0;
    }
    let single_copy_kmers = het_rate * k as f64;
    let asymptote = single_copy_kmers / (1.0 + single_copy_kmers);
    asymptote * 2.0
}

fn error_kmers(peaks: &[Peak], hist: &[u64], min_volume_fraction: f64) -> i64 {
    let Some(first) = first_genomic_peak(peaks, min_volume_fraction) else {
        return 0;
    };
    hist.iter().take(first.start).sum::<u64>() as i64
}

fn first_genomic_peak(peaks: &[Peak], min_fraction: f64) -> Option<&Peak> {
    let biggest = &peaks[biggest_peak(peaks)];
    let min_volume = (biggest.volume as f64 * min_fraction) as u64;
    peaks.iter().find(|peak| peak.volume >= min_volume)
}

fn genome_size_in_peaks(peaks: &[Peak], haploid_peak_center: f64) -> i64 {
    let mult = 1.0 / haploid_peak_center.max(1.0);
    peaks
        .iter()
        .map(|peak| peak.volume as i64 * java_round_f64(peak.center as f64 * mult))
        .sum()
}

fn genome_size2(peaks: &[Peak], haploid_peak_center: f64, hist: &[u64]) -> i64 {
    let mult = 1.0 / haploid_peak_center.max(1.0);
    hist.iter()
        .enumerate()
        .skip(peaks[0].start)
        .map(|(i, &count)| count as i64 * java_round_f64(i as f64 * mult).max(1))
        .sum()
}

fn repeat_size(peaks: &[Peak], ploidy: i32, haploid_peak_center: f64) -> i64 {
    if peaks.len() < 2 {
        return 0;
    }
    let mult = 1.0 / haploid_peak_center.max(1.0);
    let homozygous_loc = homozygous_peak(peaks, ploidy, haploid_peak_center);
    peaks
        .iter()
        .skip(homozygous_loc + 1)
        .map(|peak| peak.volume as i64 * (java_round_f64(peak.center as f64 * mult) - 1))
        .sum()
}

fn repeat_size2(peaks: &[Peak], ploidy: i32, haploid_peak_center: f64, hist: &[u64]) -> i64 {
    let mult = 1.0 / haploid_peak_center.max(1.0);
    let mut valley =
        (haploid_peak_center * f64::from(ploidy) * (1.2 + 1.0 / 2.0_f64.max(f64::from(ploidy))))
            .ceil() as usize;
    let homozygous_loc = homozygous_peak(peaks, ploidy, haploid_peak_center);
    if ploidy > 1 {
        valley = peaks[homozygous_loc].stop + 1;
    }
    hist.iter()
        .enumerate()
        .skip(valley)
        .map(|(i, &count)| count as i64 * (java_round_f64(i as f64 * mult) - 1))
        .sum()
}

fn biggest_peak(peaks: &[Peak]) -> usize {
    if peaks.len() < 2 {
        return peaks.len().saturating_sub(1);
    }
    let mut loc = 0;
    let mut biggest = &peaks[0];
    for (i, peak) in peaks.iter().enumerate().skip(1) {
        if peak.volume > biggest.volume {
            loc = i;
            biggest = peak;
        }
    }
    loc
}

fn second_biggest_peak(peaks: &[Peak]) -> usize {
    if peaks.len() < 2 {
        return peaks.len().saturating_sub(1);
    }
    let mut biggest = &peaks[0];
    let mut second = &peaks[1];
    let mut bloc = 0;
    let mut sloc = 1;
    if second.volume > biggest.volume {
        std::mem::swap(&mut second, &mut biggest);
        bloc = 1;
        sloc = 0;
    }
    for (i, peak) in peaks.iter().enumerate().skip(2) {
        if peak.volume > second.volume {
            sloc = i;
            second = peak;
            if second.volume > biggest.volume {
                std::mem::swap(&mut second, &mut biggest);
                sloc = bloc;
                bloc = i;
            }
        }
    }
    sloc
}

fn homozygous_peak(peaks: &[Peak], ploidy: i32, haploid_peak_center: f64) -> usize {
    if peaks.len() < 2 {
        return peaks.len().saturating_sub(1);
    }
    let target = haploid_peak_center * f64::from(ploidy);
    let mut best_diff = f64::from(i32::MAX);
    let mut loc = 0;
    for (i, peak) in peaks.iter().enumerate() {
        let diff = abs_diff(target, peak.center as f64);
        if diff < best_diff {
            best_diff = diff;
            loc = i;
        }
    }
    loc
}

fn haploid_peak_center(peaks: &[Peak], ploidy: i32) -> f64 {
    let biggest = &peaks[biggest_peak(peaks)];
    let second = &peaks[second_biggest_peak(peaks)];
    if second.volume.saturating_mul(4) >= biggest.volume {
        biggest.center.min(second.center) as f64
    } else {
        biggest.center as f64 / f64::from(ploidy)
    }
}

fn calc_ploidy(peaks: &[Peak], min_volume_fraction: f64) -> i32 {
    if peaks.len() < 2 {
        return 1;
    }
    let biggest = &peaks[biggest_peak(peaks)];
    let second = &peaks[second_biggest_peak(peaks)];

    if std::ptr::eq(second, biggest) {
        return 1;
    }
    if second.center < biggest.center {
        if (second.volume as f64) < (biggest.volume as f64) * min_volume_fraction {
            return 1;
        }
    } else if second.volume.saturating_mul(4) < biggest.volume {
        return 1;
    }

    let max = biggest.center.max(second.center);
    let min = biggest.center.min(second.center);
    java_round_f64(max as f64 / min as f64).max(1) as i32
}

fn calc_het_locations(peaks: &[Peak], ploidy: i32, haploid_peak_center: f64, k: usize) -> i64 {
    if peaks.len() < 2 {
        return 0;
    }
    let homozygous_loc = homozygous_peak(peaks, ploidy, haploid_peak_center);
    let homo_peak = &peaks[homozygous_loc];
    let mut sum = 0i64;
    let lim = ploidy / 2;
    for peak in peaks.iter().take(homozygous_loc) {
        let copy_count =
            java_round_f64((peak.center as f64 * f64::from(ploidy)) / homo_peak.center as f64);
        if copy_count > i64::from(lim) {
            break;
        }
        sum += peak.volume as i64;
    }
    sum / k as i64
}

fn smooth_progressive(data: &[u64], radius0: i32) -> Vec<u64> {
    let mut radius = radius0;
    let mut div = i64::from(radius * radius);
    let mut mult = 1.0 / div as f64;
    let mut smoothed = Vec::with_capacity(data.len());
    let mut next = 5i32;
    for i in 0..data.len() {
        let sum = sum_point(data, i, radius);
        smoothed.push(java_round_f64(sum as f64 * mult) as u64);
        if i as i32 > next {
            next = (1.0 + f64::from(next) * PROGRESSIVE_MULT).ceil() as i32;
            radius += 1;
            div = i64::from(radius * radius);
            mult = 1.0 / div as f64;
            if radius > MAX_RADIUS {
                next = i32::MAX;
            }
        }
    }
    smoothed
}

fn sum_point(data: &[u64], loc: usize, radius: i32) -> u64 {
    let mut sum = 0u64;
    let start = loc as i32 - radius + 1;
    let stop = loc as i32 + radius - 1;
    for (x, i) in (start..loc as i32).enumerate() {
        let i2 = i.max(0) as usize;
        sum = sum.saturating_add(data[i2].saturating_mul((x + 1) as u64));
    }
    let max = data.len().saturating_sub(1) as i32;
    let mut x = radius;
    for i in loc as i32..=stop {
        let i2 = i.min(max) as usize;
        sum = sum.saturating_add(data[i2].saturating_mul(x as u64));
        x -= 1;
    }
    sum
}

fn unique_from_raw(depth: usize, raw: u64) -> u64 {
    if depth < 1 {
        raw
    } else {
        (raw + (depth as u64 / 2)) / depth as u64
    }
}

fn java_round_f64(value: f64) -> i64 {
    (value + 0.5).floor() as i64
}

fn abs_diff(a: f64, b: f64) -> f64 {
    (a - b).abs()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn representative_histogram_calls_java_shaped_peak() {
        let config = Config {
            k: 5,
            peak_min_height: 1,
            peak_min_volume: 1,
            peak_min_width: 1,
            peak_min_peak: 1,
            peak_max_peak: 100,
            peak_max_count: 8,
            ..Default::default()
        };
        let mut raw = vec![0; 128];
        raw[20] = 720;
        let unique: Vec<u64> = raw
            .iter()
            .copied()
            .enumerate()
            .map(|(depth, raw)| unique_from_raw(depth, raw))
            .collect();
        let peaks = call_peaks(&unique, &config);
        assert_eq!(peaks.len(), 1);
        assert_eq!(peaks[0].start, 17);
        assert_eq!(peaks[0].center, 20);
        assert_eq!(peaks[0].stop, 23);
        assert_eq!(peaks[0].max_height, 36);
        assert_eq!(peaks[0].volume, 36);
    }
}