use anyhow::{bail, Context, Result};
use log::debug;
use rust_htslib::bam;
use std::collections::HashMap;
use std::io::Write;
use std::path::Path;
use super::cpp_rng::CppMt19937;
use crate::config::PreseqConfig;
/// Hash a byte slice (e.g. a BAM read name) with the crate-wide FNV-1a hasher.
/// Thin inlined wrapper so call sites in this module stay short.
#[inline(always)]
fn fnv1a_hash_bytes(bytes: &[u8]) -> u64 {
    crate::io::fnv1a(bytes)
}
// Coefficients for the Lanczos approximation of ln-gamma (g = 7, 9 terms),
// consumed by `ln_gamma` below.
#[allow(clippy::excessive_precision)]
const LANCZOS_COEFFS: [f64; 9] = [
    0.999_999_999_999_809_93,
    676.520_368_121_885_1,
    -1_259.139_216_722_403,
    771.323_428_777_653_1,
    -176.615_029_162_140_6,
    12.507_343_278_686_905,
    -0.138_571_095_265_720_12,
    9.984_369_578_019_572e-6,
    1.505_632_735_149_311_6e-7,
];
// Lanczos shift parameter matching the coefficient set above.
const LANCZOS_G: f64 = 7.0;
// Underflow/overflow guard used by the QD algorithm and CF evaluation.
const TOLERANCE: f64 = 1e-20;
// Minimum distinct molecules required before attempting complexity estimation.
const MIN_DISTINCT: u64 = 10;
// Pair-merge length cap used by the unit tests.
#[cfg(test)]
const DEFAULT_MAX_SEGMENT_LENGTH: i64 = 100_000_000;
/// Position of one mate of a read pair, held until its partner is seen.
#[derive(Debug, Clone)]
struct MateInfo {
    tid: i32,   // reference (target) id
    start: i64, // alignment start position
    end: i64,   // alignment end position (from the CIGAR)
}
/// Streaming accumulator of fragment duplicate counts for preseq-style
/// library-complexity estimation.
#[derive(Debug)]
pub struct PreseqAccum {
    // fragment-identity hash -> number of times that fragment was observed
    fragment_counts: HashMap<u64, u32>,
    pub total_fragments: u64,
    // read-name hash -> first-seen mate awaiting its partner
    dangling_mates: HashMap<u64, MateInfo>,
    n_paired: u64,
    n_unpaired: u64,
    n_mates_processed: u64,
    // pairs merging to a span longer than this are split into two fragments
    max_segment_length: i64,
}
impl PreseqAccum {
    /// Create an empty accumulator. `max_segment_length` caps the merged span
    /// of a read pair; longer pairs are counted as two separate fragments.
    pub fn new(max_segment_length: i64) -> Self {
        Self {
            fragment_counts: HashMap::new(),
            total_fragments: 0,
            dangling_mates: HashMap::new(),
            n_paired: 0,
            n_unpaired: 0,
            n_mates_processed: 0,
            max_segment_length,
        }
    }
    /// Ingest one alignment record.
    ///
    /// Skips unplaced (tid < 0), secondary, and supplementary records. For
    /// paired reads the first mate seen is parked in `dangling_mates` keyed by
    /// a hash of the read name; when its partner arrives on the same reference
    /// the pair is merged into one fragment, otherwise (different reference,
    /// or a negative/over-long merged span) each mate counts independently.
    /// Single-end reads are keyed by start position only.
    pub fn process_read(&mut self, record: &bam::Record) {
        if record.tid() < 0 {
            return;
        }
        if record.is_secondary() || record.is_supplementary() {
            return;
        }
        self.n_mates_processed += 1;
        let tid = record.tid();
        let start = record.pos();
        let end = record.cigar().end_pos();
        let info = MateInfo { tid, start, end };
        if record.is_paired() {
            let qname_hash = fnv1a_hash_bytes(record.qname());
            if let Some(mate) = self.dangling_mates.remove(&qname_hash) {
                if mate.tid == tid {
                    let merged_start = mate.start.min(start);
                    let merged_end = mate.end.max(end);
                    let frag_len = merged_end - merged_start;
                    if frag_len >= 0 && frag_len <= self.max_segment_length {
                        self.add_fragment(tid, merged_start, merged_end);
                        self.n_paired += 1;
                    } else {
                        // Implausible merged span: count the mates separately.
                        self.add_fragment(mate.tid, mate.start, mate.end);
                        self.add_fragment(tid, start, end);
                        self.n_unpaired += 2;
                    }
                } else {
                    // Mates on different references: count separately.
                    self.add_fragment(mate.tid, mate.start, mate.end);
                    self.add_fragment(tid, start, end);
                    self.n_unpaired += 2;
                }
            } else {
                // First mate of the pair; wait for its partner.
                self.dangling_mates.insert(qname_hash, info);
            }
        } else {
            self.add_fragment_se(tid, start);
            self.n_unpaired += 1;
        }
    }
    /// Record one fragment keyed by (tid, start, end).
    #[cfg_attr(test, allow(dead_code))]
    pub(crate) fn add_fragment(&mut self, tid: i32, start: i64, end: i64) {
        let key = fragment_hash(tid, start, end);
        *self.fragment_counts.entry(key).or_insert(0) += 1;
        self.total_fragments += 1;
    }
    /// Record one single-end fragment keyed by (tid, start) only — the end
    /// coordinate is ignored, so same-start SE reads collapse together.
    fn add_fragment_se(&mut self, tid: i32, start: i64) {
        let key = fragment_hash_se(tid, start);
        *self.fragment_counts.entry(key).or_insert(0) += 1;
        self.total_fragments += 1;
    }
    /// Fold another accumulator into this one, resolving read pairs whose two
    /// mates landed in different accumulators with the same rules as
    /// `process_read`.
    pub fn merge(&mut self, other: PreseqAccum) {
        for (key, count) in other.fragment_counts {
            *self.fragment_counts.entry(key).or_insert(0) += count;
        }
        self.total_fragments += other.total_fragments;
        self.n_paired += other.n_paired;
        self.n_unpaired += other.n_unpaired;
        self.n_mates_processed += other.n_mates_processed;
        for (qname, other_mate) in other.dangling_mates {
            if let Some(self_mate) = self.dangling_mates.remove(&qname) {
                if self_mate.tid == other_mate.tid {
                    let merged_start = self_mate.start.min(other_mate.start);
                    let merged_end = self_mate.end.max(other_mate.end);
                    let frag_len = merged_end - merged_start;
                    if frag_len >= 0 && frag_len <= self.max_segment_length {
                        self.add_fragment(self_mate.tid, merged_start, merged_end);
                        self.n_paired += 1;
                    } else {
                        self.add_fragment(self_mate.tid, self_mate.start, self_mate.end);
                        self.add_fragment(other_mate.tid, other_mate.start, other_mate.end);
                        self.n_unpaired += 2;
                    }
                } else {
                    self.add_fragment(self_mate.tid, self_mate.start, self_mate.end);
                    self.add_fragment(other_mate.tid, other_mate.start, other_mate.end);
                    self.n_unpaired += 2;
                }
            } else {
                self.dangling_mates.insert(qname, other_mate);
            }
        }
    }
    /// Flush every still-unmatched mate as a standalone fragment. Call once
    /// after all records (and merges) have been processed.
    pub fn finalize(&mut self) {
        let n_dangling = self.dangling_mates.len();
        let dangling: Vec<MateInfo> = self.dangling_mates.drain().map(|(_, v)| v).collect();
        for mate in dangling {
            self.add_fragment(mate.tid, mate.start, mate.end);
            self.n_unpaired += 1;
        }
        debug!(
            "preseq finalize: mates_processed={}, n_paired={}, n_unpaired={}, dangling_flushed={}, total_fragments={}, n_distinct={}",
            self.n_mates_processed, self.n_paired, self.n_unpaired, n_dangling, self.total_fragments, self.fragment_counts.len()
        );
    }
    /// Collapse per-fragment counts into a (multiplicity j, #fragments seen j
    /// times) histogram, sorted by multiplicity.
    pub fn into_histogram(self) -> Vec<(u64, u64)> {
        let mut freq_of_freq: HashMap<u64, u64> = HashMap::new();
        for &count in self.fragment_counts.values() {
            *freq_of_freq.entry(count as u64).or_insert(0) += 1;
        }
        let mut hist: Vec<(u64, u64)> = freq_of_freq.into_iter().collect();
        hist.sort_by_key(|&(j, _)| j);
        hist
    }
    /// Number of distinct fragments observed so far.
    pub fn n_distinct(&self) -> u64 {
        self.fragment_counts.len() as u64
    }
}
/// Hash a fragment identity (reference id, start, end) into a 64-bit key.
///
/// The three values are packed as consecutive little-endian u64 words and run
/// through the crate-wide FNV-1a hasher.
fn fragment_hash(tid: i32, start: i64, end: i64) -> u64 {
    let mut buf = [0u8; 24];
    let words = [tid as u64, start as u64, end as u64];
    for (chunk, word) in buf.chunks_exact_mut(8).zip(words) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    crate::io::fnv1a(&buf)
}
/// Hash a single-end fragment identity (reference id, start) — the end
/// coordinate is deliberately excluded from the key.
fn fragment_hash_se(tid: i32, start: i64) -> u64 {
    let mut buf = [0u8; 16];
    for (chunk, word) in buf.chunks_exact_mut(8).zip([tid as u64, start as u64]) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    crate::io::fnv1a(&buf)
}
/// Natural log of the gamma function via the Lanczos approximation
/// (g = 7, 9 coefficients). Returns +inf for non-positive inputs.
fn ln_gamma(x: f64) -> f64 {
    if x <= 0.0 {
        return f64::INFINITY;
    }
    let z = x - 1.0;
    // Left-to-right fold keeps the summation order of the reference series.
    let series = LANCZOS_COEFFS
        .iter()
        .enumerate()
        .skip(1)
        .fold(LANCZOS_COEFFS[0], |acc, (i, &c)| acc + c / (z + i as f64));
    let t = z + LANCZOS_G + 0.5;
    0.5 * (2.0 * std::f64::consts::PI).ln() + (t.ln() * (z + 0.5)) - t + series.ln()
}
/// Natural log of the binomial coefficient C(n, k), computed through
/// `ln_gamma`. Returns -inf when k > n; exactly 0 at the k = 0 / k = n edges.
fn ln_binomial(n: u64, k: u64) -> f64 {
    if k > n {
        f64::NEG_INFINITY
    } else if k == 0 || k == n {
        0.0
    } else {
        ln_gamma(n as f64 + 1.0) - ln_gamma(k as f64 + 1.0) - ln_gamma((n - k) as f64 + 1.0)
    }
}
/// Expected number of distinct fragments at a subsampled depth `target < n`,
/// by exact hypergeometric interpolation:
/// E[distinct at t] = S - sum_j n_j * C(n - j, t) / C(n, t), in log space.
///
/// Fix: the running sum is accumulated in `f64`. The previous version kept it
/// in an `i32` and round-tripped `i32 -> f64 -> i32` every iteration, which
/// truncated the fractional contribution of each term (systematic bias) and
/// could overflow for large histograms.
fn interpolate(histogram: &[(u64, u64)], total_reads: u64, n_distinct: u64, target: f64) -> f64 {
    let n = total_reads;
    let t = target as u64;
    // At or beyond the observed depth the answer is the observed distinct count.
    if t >= n {
        return n_distinct as f64;
    }
    // Densify the histogram so multiplicity j indexes directly.
    let max_j = histogram.iter().map(|&(j, _)| j).max().unwrap_or(0) as usize;
    let mut dense_hist = vec![0u64; max_j + 1];
    for &(j, n_j) in histogram {
        if (j as usize) < dense_hist.len() {
            dense_hist[j as usize] = n_j;
        }
    }
    let denom = ln_binomial(n, t);
    // Expected number of observed fragment classes that vanish at depth t.
    let mut expected_lost = 0.0f64;
    for (i, &n_j) in dense_hist.iter().enumerate().skip(1) {
        if n_j == 0 {
            continue;
        }
        // C(n - i, t) is zero (log = -inf) when n - i < t; skip those terms.
        if n >= (i as u64) + t {
            let numer = ln_binomial(n - i as u64, t);
            expected_lost += (numer - denom).exp() * n_j as f64;
        }
    }
    n_distinct as f64 - expected_lost
}
/// Alternating-sign power-series coefficients of the discovery curve:
/// coefficient j (0-based) is (-1)^j * n_{j+1}, truncated at `max_terms` or at
/// the histogram's maximum multiplicity, whichever is smaller.
fn power_series_coeffs(histogram: &[(u64, u64)], max_terms: usize) -> Vec<f64> {
    let max_j = histogram.iter().map(|&(j, _)| j).max().unwrap_or(0) as usize;
    // Dense count-by-multiplicity table (slot 0 and the trailing slot stay 0).
    let mut counts = vec![0u64; max_j + 2];
    for &(j, n_j) in histogram {
        if let Some(slot) = counts.get_mut(j as usize) {
            *slot = n_j;
        }
    }
    (0..max_terms.min(max_j))
        .map(|j| {
            let n_val = counts.get(j + 1).copied().unwrap_or(0) as f64;
            if j % 2 == 0 {
                n_val
            } else {
                -n_val
            }
        })
        .collect()
}
/// Quotient-difference (QD) algorithm: converts truncated power-series
/// coefficients into continued-fraction coefficients.
///
/// Returns `None` when a leading series coefficient is (near) zero, since the
/// first quotient column would then divide by ~0. For fewer than two input
/// coefficients the single coefficient (if any) is returned unchanged.
fn qd_algorithm(ps_coeffs: &[f64]) -> Option<Vec<f64>> {
    let n = ps_coeffs.len();
    if n < 2 {
        return ps_coeffs.first().map(|&c| vec![c]);
    }
    let depth = n;
    let width = depth + 1;
    // Flattened (depth x width) q- and e-tables; row 0 is implicitly all zero.
    let mut q_table = vec![0.0f64; depth * width];
    let mut e_table = vec![0.0f64; depth * width];
    let idx = |row: usize, col: usize| -> usize { row * width + col };
    // First q row: ratios of consecutive power-series coefficients.
    for j in 0..depth.saturating_sub(1) {
        if ps_coeffs[j].abs() < TOLERANCE {
            return None;
        }
        if j + 1 < n {
            q_table[idx(1, j)] = ps_coeffs[j + 1] / ps_coeffs[j];
        }
    }
    // First e row via the additive rhombus rule.
    for j in 0..depth.saturating_sub(1) {
        e_table[idx(1, j)] = q_table[idx(1, j + 1)] - q_table[idx(1, j)] + e_table[idx(0, j + 1)];
    }
    // Remaining rows alternate the multiplicative (q) and additive (e) rules.
    for i in 2..depth {
        for j in 0..depth {
            let denom = e_table[idx(i - 1, j)];
            // A vanishing e-pivot ends this row early instead of dividing by ~0.
            if denom.abs() < TOLERANCE {
                break;
            }
            q_table[idx(i, j)] = q_table[idx(i - 1, j + 1)] * e_table[idx(i - 1, j + 1)] / denom;
        }
        for j in 0..depth {
            e_table[idx(i, j)] =
                q_table[idx(i, j + 1)] - q_table[idx(i, j)] + e_table[idx(i - 1, j + 1)];
        }
    }
    // CF coefficients read off column 0: c0, then alternating -q and -e values.
    let mut cf_coeffs = Vec::with_capacity(depth);
    cf_coeffs.push(ps_coeffs[0]);
    for i in 1..depth {
        if i % 2 != 0 {
            cf_coeffs.push(-q_table[idx(i.div_ceil(2), 0)]);
        } else {
            cf_coeffs.push(-e_table[idx(i / 2, 0)]);
        }
    }
    Some(cf_coeffs)
}
/// Evaluate the continued fraction at `t`, truncated to `degree` terms, using
/// the three-term recurrence on numerator/denominator convergents.
/// Returns NaN when no coefficients are available.
fn evaluate_cf(cf_coeffs: &[f64], t: f64, degree: usize) -> f64 {
    let n = degree.min(cf_coeffs.len());
    if n == 0 {
        return f64::NAN;
    }
    let mut prev_num = 0.0_f64;
    let mut curr_num = cf_coeffs[0];
    let mut prev_den = 1.0_f64;
    let mut curr_den = 1.0_f64;
    let limit = n.min(cf_coeffs.len());
    for &cf_coeff in cf_coeffs.iter().take(limit).skip(1) {
        let cf_val = cf_coeff * t;
        let new_num = curr_num + cf_val * prev_num;
        let new_den = curr_den + cf_val * prev_den;
        prev_num = curr_num;
        curr_num = new_num;
        prev_den = curr_den;
        curr_den = new_den;
        // Rescale all four convergents when they grow too large ...
        let rescale_factor = curr_num.abs() + curr_den.abs();
        if rescale_factor >= 1.0 / TOLERANCE {
            curr_num /= rescale_factor;
            curr_den /= rescale_factor;
            prev_num /= rescale_factor;
            prev_den /= rescale_factor;
        }
        // ... or too small (dividing by the tiny factor scales them back up).
        // The num/den ratio is unchanged either way, so the result is stable.
        if rescale_factor <= TOLERANCE {
            curr_num /= rescale_factor;
            curr_den /= rescale_factor;
            prev_num /= rescale_factor;
            prev_den /= rescale_factor;
        }
    }
    curr_num / curr_den
}
/// Pick a continued-fraction truncation degree whose yield curve t -> t*CF(t)
/// is finite, non-negative, non-decreasing, and has non-increasing forward
/// differences over a fixed test grid.
///
/// For `max_terms` in 3..=6 only that exact degree is tried; otherwise the
/// search walks upward from 7 (odd `max_terms`) or 8 (even), stepping by 2 to
/// keep parity. Returns `None` when no degree passes, which callers treat as
/// "no extrapolation possible".
fn select_degree(
    cf_coeffs: &[f64],
    _total_reads: u64,
    max_terms: usize,
    _max_extrap: f64,
) -> Option<usize> {
    // Test grid: t = 0.05, 0.10, ..., 100.0 (in fold-increase units).
    let search_max = 100.0;
    let search_step = 0.05f64;
    let n_test = (search_max / search_step).ceil() as usize;
    let check_degree = |degree: usize| -> bool {
        if degree > cf_coeffs.len() || degree == 0 {
            return false;
        }
        let mut prev_val = 0.0f64;
        let mut prev_deriv = 0.0f64;
        for i in 1..=n_test {
            let t = search_step * i as f64;
            let cf_val = evaluate_cf(cf_coeffs, t, degree);
            let val = t * cf_val;
            // Reject non-finite or negative yields ...
            if val < 0.0 || !val.is_finite() {
                return false;
            }
            // ... any decrease ...
            if val < prev_val {
                return false;
            }
            // ... and any growth of the forward difference (non-concavity).
            if i >= 2 {
                let curr_deriv = val - prev_val;
                if curr_deriv > prev_deriv {
                    return false;
                }
                prev_deriv = curr_deriv;
            } else {
                prev_deriv = val - prev_val;
            }
            prev_val = val;
        }
        true
    };
    if (3..=6).contains(&max_terms) {
        if check_degree(max_terms) {
            debug!(
                "Selected CF degree {} for extrapolation (small max_terms)",
                max_terms
            );
            return Some(max_terms);
        }
        return None;
    }
    // Start at 7 or 8 matching the parity of max_terms, then step by 2.
    let start = 7 + if max_terms.is_multiple_of(2) { 1 } else { 0 };
    let mut degree = start;
    while degree <= max_terms {
        if check_degree(degree) {
            debug!("Selected CF degree {} for extrapolation", degree);
            return Some(degree);
        }
        degree += 2;
    }
    None
}
/// Extrapolate expected distinct molecules beyond the observed depth:
/// S + fold * CF(fold), where fold = (target - N) / N is the relative
/// sequencing increase past the observed total N.
fn extrapolate(
    cf_coeffs: &[f64],
    degree: usize,
    total_reads: u64,
    n_distinct: u64,
    target: f64,
) -> f64 {
    let observed = total_reads as f64;
    let fold = (target - observed) / observed;
    n_distinct as f64 + fold * evaluate_cf(cf_coeffs, fold, degree)
}
/// "Defects" variant of the power-series construction. Currently identical in
/// output to `power_series_coeffs` ((-1)^j * n_{j+1}); `_total_reads` is
/// accepted for signature parity but unused.
fn power_series_coeffs_defects(
    histogram: &[(u64, u64)],
    _total_reads: u64,
    max_terms: usize,
) -> Vec<f64> {
    let max_j = histogram.iter().map(|&(j, _)| j).max().unwrap_or(0) as usize;
    let mut counts = vec![0u64; max_j + 2];
    for &(j, n_j) in histogram {
        if let Some(slot) = counts.get_mut(j as usize) {
            *slot = n_j;
        }
    }
    let n_terms = max_terms.min(max_j);
    let mut coeffs = Vec::with_capacity(n_terms);
    for (j, sign) in (0..n_terms).zip([1.0f64, -1.0].into_iter().cycle()) {
        let n_val = counts.get(j + 1).copied().unwrap_or(0) as f64;
        coeffs.push(sign * n_val);
    }
    coeffs
}
/// Draw one multinomial bootstrap resample of the duplicate histogram.
///
/// The multinomial over the non-empty multiplicity classes is realized as a
/// sequence of conditional binomial draws (one per class, in histogram order,
/// so the RNG stream is consumed deterministically). Returns the resampled
/// histogram (sorted by multiplicity) plus its total-read and distinct counts.
fn bootstrap_resample(
    histogram: &[(u64, u64)],
    rng: &mut CppMt19937,
) -> (Vec<(u64, u64)>, u64, u64) {
    // Keep only classes with a positive count.
    let (indices, weights): (Vec<u64>, Vec<u64>) = histogram
        .iter()
        .filter(|&&(_, n_j)| n_j > 0)
        .copied()
        .unzip();
    let n_distinct: u64 = weights.iter().sum();
    if n_distinct == 0 || weights.is_empty() {
        return (Vec::new(), 0, 0);
    }
    // Conditional-binomial decomposition of the multinomial sample.
    let mut trials_left = n_distinct;
    let mut prob_left = n_distinct as f64;
    let mut draws = Vec::with_capacity(indices.len());
    for &w in &weights {
        let drawn = rng.binomial(trials_left, w as f64 / prob_left);
        draws.push(drawn);
        prob_left -= w as f64;
        trials_left -= drawn;
    }
    // Rebuild the frequency-of-frequency histogram from the draws.
    let mut freq_of_freq: HashMap<u64, u64> = HashMap::new();
    let mut resample_total = 0u64;
    let mut resample_distinct = 0u64;
    for (&freq, &count) in indices.iter().zip(&draws) {
        if count > 0 {
            *freq_of_freq.entry(freq).or_insert(0) += count;
            resample_total += freq * count;
            resample_distinct += count;
        }
    }
    let mut hist: Vec<(u64, u64)> = freq_of_freq.into_iter().collect();
    hist.sort_unstable_by_key(|&(j, _)| j);
    (hist, resample_total, resample_distinct)
}
/// Library complexity curve: each entry is
/// (target read count, expected distinct, lower CI bound, upper CI bound).
/// CI bounds are NaN when bootstrapping is disabled or yielded < 2 replicates.
#[derive(Debug)]
pub struct PreseqResult {
    pub curve: Vec<(f64, f64, f64, f64)>,
}
/// Good-Toulmin 2x-extrapolation estimate: the alternating sum
/// n_1 - n_2 + n_3 - ... over the histogram (multiplicity 0 is skipped).
/// A negative value signals that extrapolation would be unreliable.
fn good_toulmin_2x_extrap(histogram: &[(u64, u64)]) -> f64 {
    histogram
        .iter()
        .filter(|&&(j, _)| j > 0)
        .map(|&(j, n_j)| {
            if j % 2 == 1 {
                n_j as f64
            } else {
                -(n_j as f64)
            }
        })
        .sum()
}
/// Estimate the library complexity curve from a duplicate-count histogram.
///
/// `histogram` lists (multiplicity j, #fragments seen j times) pairs;
/// `total_reads` and `n_distinct` are its totals. Targets below the observed
/// depth are interpolated exactly; larger targets are extrapolated with a
/// continued-fraction approximant. With `config.n_bootstraps > 0` the point
/// estimate at each target is the bootstrap median, with quantile confidence
/// bounds at `config.confidence_level`.
///
/// # Errors
/// Fails when fewer than `MIN_DISTINCT` distinct molecules were observed, or
/// when the Good-Toulmin 2x sanity check is negative.
pub fn estimate_complexity(
    histogram: &[(u64, u64)],
    total_reads: u64,
    n_distinct: u64,
    config: &PreseqConfig,
) -> Result<PreseqResult> {
    if n_distinct < MIN_DISTINCT {
        bail!(
            "Too few distinct molecules ({}) for library complexity estimation (minimum: {})",
            n_distinct,
            MIN_DISTINCT
        );
    }
    debug!(
        "Library complexity estimation: {} total reads, {} distinct",
        total_reads, n_distinct
    );
    // A negative Good-Toulmin 2x value means the alternating series misbehaves
    // for this histogram, so any extrapolation would be unreliable.
    let gt2x = good_toulmin_2x_extrap(histogram);
    if gt2x < 0.0 {
        bail!(
            "Good-Toulmin 2x extrapolation check failed (value={:.2}). \
            Library complexity estimation is unreliable for this sample.",
            gt2x
        );
    }
    // Evaluation targets: step, 2*step, ... strictly below max_extrap.
    let max_extrap = config.max_extrap;
    let step = config.step_size;
    let mut targets: Vec<f64> = Vec::new();
    let mut t = step;
    while t < max_extrap {
        targets.push(t);
        t += step;
    }
    let n_bootstraps = config.n_bootstraps;
    if n_bootstraps == 0 {
        // No bootstrap: single point-estimate curve with NaN CI columns.
        let point_curve = compute_curve(
            histogram,
            total_reads,
            n_distinct,
            n_distinct,
            &targets,
            config.max_terms,
            max_extrap,
            config.defects,
        )?;
        let curve: Vec<(f64, f64, f64, f64)> = point_curve
            .iter()
            .map(|&(t, e)| (t, e, f64::NAN, f64::NAN))
            .collect();
        return Ok(PreseqResult { curve });
    }
    debug!("Running {} bootstrap replicates...", n_bootstraps);
    let boot_max_terms = config.max_terms;
    let boot_defects = config.defects;
    let n_targets = targets.len();
    let mut rng = CppMt19937::new(config.seed as u32);
    // bootstrap_curves[i] collects accepted replicate estimates at target i.
    let mut bootstrap_curves: Vec<Vec<f64>> = vec![Vec::new(); n_targets];
    let mut successful_bootstraps = 0u32;
    // Cap attempts so a pathological histogram cannot spin forever.
    let max_iter = 100 * n_bootstraps as u64;
    let mut iter_count = 0u64;
    #[cfg(test)]
    let mut reject_empty = 0u64;
    #[cfg(test)]
    let mut reject_unstable = 0u64;
    #[cfg(test)]
    let mut accept_count = 0u64;
    while successful_bootstraps < n_bootstraps && iter_count < max_iter {
        iter_count += 1;
        let (boot_hist, boot_total, boot_distinct) = bootstrap_resample(histogram, &mut rng);
        if boot_total == 0 {
            continue;
        }
        // The resample's own distinct count drives interpolation, but the
        // full-data distinct count anchors the extrapolation branch.
        if let Ok(boot_curve) = compute_curve(
            &boot_hist,
            boot_total,
            boot_distinct,
            n_distinct,
            &targets,
            boot_max_terms,
            max_extrap,
            boot_defects,
        ) {
            if !boot_curve.is_empty() {
                let yield_vector: Vec<f64> = boot_curve.iter().map(|&(_, e)| e).collect();
                // Only keep replicates whose curve passes the stability check.
                if check_yield_estimates_stability(&yield_vector) {
                    for (i, &(_, expected)) in boot_curve.iter().enumerate() {
                        if i < bootstrap_curves.len() {
                            bootstrap_curves[i].push(expected);
                        }
                    }
                    successful_bootstraps += 1;
                    #[cfg(test)]
                    {
                        accept_count += 1;
                    }
                } else {
                    #[cfg(test)]
                    {
                        reject_unstable += 1;
                    }
                }
            } else {
                #[cfg(test)]
                {
                    reject_empty += 1;
                }
            }
        }
    }
    #[cfg(test)]
    eprintln!(
        "DIAGNOSTIC: bootstrap stats: iter={}, accepted={}, rejected_empty={}, rejected_unstable={}",
        iter_count, accept_count, reject_empty, reject_unstable
    );
    // Two-sided quantile CI at the configured confidence level.
    let alpha = 1.0 - config.confidence_level;
    let lower_q = alpha / 2.0;
    let upper_q = 1.0 - alpha / 2.0;
    let curve: Vec<(f64, f64, f64, f64)> = targets
        .iter()
        .enumerate()
        .map(|(i, &target)| {
            if i < bootstrap_curves.len() && bootstrap_curves[i].len() >= 2 {
                let mut vals = bootstrap_curves[i].clone();
                vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
                let median = median_sorted(&vals);
                let lower = quantile(&vals, lower_q);
                let upper = quantile(&vals, upper_q);
                (target, median, lower, upper)
            } else if i < bootstrap_curves.len() && bootstrap_curves[i].len() == 1 {
                // A single replicate gives a point estimate but no CI.
                let val = bootstrap_curves[i][0];
                (target, val, f64::NAN, f64::NAN)
            } else {
                (target, f64::NAN, f64::NAN, f64::NAN)
            }
        })
        .collect();
    Ok(PreseqResult { curve })
}
/// Median of an already-sorted slice; NaN for an empty slice.
fn median_sorted(sorted: &[f64]) -> f64 {
    match sorted.len() {
        0 => f64::NAN,
        n if n % 2 == 1 => sorted[n / 2],
        n => 0.5 * (sorted[n / 2 - 1] + sorted[n / 2]),
    }
}
/// A yield curve is "stable" when every estimate is finite and non-negative,
/// the sequence never decreases, and its increments never grow (i.e. the
/// curve is concave). Empty input is unstable.
fn check_yield_estimates_stability(estimates: &[f64]) -> bool {
    if estimates.is_empty() {
        return false;
    }
    if estimates.iter().any(|e| !e.is_finite() || *e < 0.0) {
        return false;
    }
    let non_decreasing = estimates.windows(2).all(|w| w[1] >= w[0]);
    let concave = estimates.windows(3).all(|w| w[2] - w[1] <= w[1] - w[0]);
    non_decreasing && concave
}
/// Compute expected-distinct values at each target depth.
///
/// Targets below the observed read count are interpolated directly from the
/// histogram (anchored at `interp_distinct`); larger targets are extrapolated
/// via the continued-fraction approximant (anchored at `extrap_distinct`).
/// Returns an empty curve when no stable CF degree can be selected, and an
/// error when the QD algorithm itself fails.
#[allow(clippy::too_many_arguments)]
fn compute_curve(
    histogram: &[(u64, u64)],
    total_reads: u64,
    interp_distinct: u64,
    extrap_distinct: u64,
    targets: &[f64],
    max_terms: usize,
    max_extrap: f64,
    use_defects: bool,
) -> Result<Vec<(f64, f64)>> {
    let n = total_reads as f64;
    let max_j = histogram.iter().map(|&(j, _)| j).max().unwrap_or(0) as usize;
    // First multiplicity class with zero count: the power series is only
    // trusted up to (not including) this frequency.
    let first_zero = {
        let mut fz = max_j + 1;
        for j in 1..=max_j {
            if !histogram.iter().any(|&(mult, _)| mult == j as u64) {
                fz = j;
                break;
            }
        }
        fz
    };
    // Cap the series length below the first gap and force it even.
    let adjusted_max_terms = if first_zero > 1 {
        let capped = max_terms.min(first_zero - 1);
        capped - (capped % 2)
    } else {
        0
    };
    let max_terms = adjusted_max_terms;
    let ps_coeffs = if use_defects {
        power_series_coeffs_defects(histogram, total_reads, max_terms)
    } else {
        power_series_coeffs(histogram, max_terms)
    };
    let cf_coeffs = qd_algorithm(&ps_coeffs)
        .context("Failed to compute continued fraction coefficients via QD algorithm")?;
    // No acceptable degree is signalled as an empty curve, not an error.
    let degree = match select_degree(&cf_coeffs, total_reads, max_terms, max_extrap) {
        Some(d) => d,
        None => return Ok(Vec::new()),
    };
    let mut curve = Vec::with_capacity(targets.len());
    for &target in targets {
        let expected = if target < n {
            interpolate(histogram, total_reads, interp_distinct, target)
        } else {
            extrapolate(&cf_coeffs, degree, total_reads, extrap_distinct, target)
        };
        curve.push((target, expected));
    }
    Ok(curve)
}
/// Linear-interpolation quantile of an already-sorted slice (q in [0, 1]).
/// NaN for an empty slice; the single element for a one-element slice.
fn quantile(sorted: &[f64], q: f64) -> f64 {
    match sorted {
        [] => f64::NAN,
        [only] => *only,
        _ => {
            let pos = q * (sorted.len() as f64 - 1.0);
            let lo = pos.floor() as usize;
            let hi = pos.ceil() as usize;
            if hi >= sorted.len() {
                // Clamp: q at (or numerically past) the top returns the max.
                sorted[sorted.len() - 1]
            } else {
                let frac = pos - lo as f64;
                sorted[lo] * (1.0 - frac) + sorted[hi] * frac
            }
        }
    }
}
/// Write the complexity curve as a preseq-style TSV table.
///
/// Columns: total reads, expected distinct, lower CI, upper CI; NaN CI bounds
/// are rendered as "-". A leading `0\t0\t0\t0` row anchors the curve.
///
/// Improvements: writes go through a `BufWriter` (previously each `writeln!`
/// hit the unbuffered `File` directly — one syscall per line), and the buffer
/// is flushed explicitly so write errors surface as `Err` instead of being
/// silently swallowed by `Drop`.
///
/// # Errors
/// Fails when the file cannot be created, written, or flushed.
pub fn write_output(
    result: &PreseqResult,
    output_path: &Path,
    confidence_level: f64,
) -> Result<()> {
    let file = std::fs::File::create(output_path)
        .with_context(|| format!("Failed to create preseq output: {}", output_path.display()))?;
    let mut f = std::io::BufWriter::new(file);
    writeln!(
        f,
        "TOTAL_READS\tEXPECTED_DISTINCT\tLOWER_{:.2}CI\tUPPER_{:.2}CI",
        confidence_level, confidence_level
    )?;
    writeln!(f, "0\t0\t0\t0")?;
    for &(total, expected, lower, upper) in &result.curve {
        if lower.is_nan() || upper.is_nan() {
            writeln!(f, "{:.1}\t{:.1}\t-\t-", total, expected)?;
        } else {
            writeln!(
                f,
                "{:.1}\t{:.1}\t{:.1}\t{:.1}",
                total, expected, lower, upper
            )?;
        }
    }
    // Flush explicitly: Drop's implicit flush discards any error.
    f.flush()
        .with_context(|| format!("Failed to flush preseq output: {}", output_path.display()))?;
    debug!("Wrote preseq output to {}", output_path.display());
    Ok(())
}
// Unit tests: math helpers (ln_gamma, ln_binomial, CF/QD machinery), the
// fragment accumulator (hashing, merging, finalization), and the end-to-end
// estimation pipeline with and without bootstrapping.
#[cfg(test)]
mod tests {
    use super::*;
    // ln_gamma at points with closed-form values: Γ(1)=Γ(2)=1, Γ(5)=24, Γ(0.5)=√π.
    #[test]
    fn test_ln_gamma_known_values() {
        assert!((ln_gamma(1.0) - 0.0).abs() < 1e-10);
        assert!((ln_gamma(2.0) - 0.0).abs() < 1e-10);
        assert!((ln_gamma(5.0) - (24.0f64).ln()).abs() < 1e-10);
        let expected = std::f64::consts::PI.sqrt().ln();
        assert!(
            (ln_gamma(0.5) - expected).abs() < 1e-10,
            "ln_gamma(0.5) = {}, expected {}",
            ln_gamma(0.5),
            expected
        );
    }
    // Binomial coefficients: edges, two known values, and the k > n case.
    #[test]
    fn test_ln_binomial() {
        assert!((ln_binomial(10, 0) - 0.0).abs() < 1e-10);
        assert!((ln_binomial(10, 10) - 0.0).abs() < 1e-10);
        assert!(
            (ln_binomial(10, 5) - (252.0f64).ln()).abs() < 1e-8,
            "ln_binomial(10,5) = {}, expected {}",
            ln_binomial(10, 5),
            (252.0f64).ln()
        );
        assert!(
            (ln_binomial(20, 10) - (184756.0f64).ln()).abs() < 1e-6,
            "ln_binomial(20,10) = {}, expected {}",
            ln_binomial(20, 10),
            (184756.0f64).ln()
        );
        assert!(ln_binomial(5, 10).is_infinite());
    }
    // Same (tid, start, end) must hash identically; different start must not.
    #[test]
    fn test_fragment_hash_deterministic() {
        let h1 = fragment_hash(1, 200, 300);
        let h2 = fragment_hash(1, 200, 300);
        assert_eq!(h1, h2);
        let h3 = fragment_hash(1, 100, 300);
        assert_ne!(h1, h3);
    }
    // SE hash depends only on (tid, start).
    #[test]
    fn test_se_fragment_hash_ignores_end() {
        let h1 = fragment_hash_se(1, 200);
        let h2 = fragment_hash_se(1, 200);
        assert_eq!(h1, h2);
        let h3 = fragment_hash_se(1, 300);
        assert_ne!(h1, h3);
    }
    // Three SE reads at the same start collapse into one distinct fragment.
    #[test]
    fn test_se_fragments_collapse_same_start_different_end() {
        let mut accum = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum.add_fragment_se(0, 100);
        accum.add_fragment_se(0, 100);
        accum.add_fragment_se(0, 100);
        accum.add_fragment_se(0, 200);
        assert_eq!(accum.total_fragments, 4);
        assert_eq!(accum.n_distinct(), 2, "SE: same start = same fragment");
        let hist = accum.into_histogram();
        assert!(hist.contains(&(1, 1)), "One singleton");
        assert!(hist.contains(&(3, 1)), "One triplet");
    }
    // 10 fragments / 6 distinct -> histogram {1: 3, 2: 2, 3: 1}.
    #[test]
    fn test_histogram_construction() {
        let mut accum = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum.add_fragment(0, 100, 200);
        accum.add_fragment(0, 200, 300);
        accum.add_fragment(0, 300, 400);
        accum.add_fragment(0, 400, 500);
        accum.add_fragment(0, 400, 500);
        accum.add_fragment(0, 500, 600);
        accum.add_fragment(0, 500, 600);
        accum.add_fragment(0, 600, 700);
        accum.add_fragment(0, 600, 700);
        accum.add_fragment(0, 600, 700);
        assert_eq!(accum.total_fragments, 10);
        assert_eq!(accum.n_distinct(), 6);
        let hist = accum.into_histogram();
        assert_eq!(hist.len(), 3);
        assert_eq!(hist[0], (1, 3));
        assert_eq!(hist[1], (2, 2));
        assert_eq!(hist[2], (3, 1));
    }
    // total_fragments counts every add; n_distinct deduplicates.
    #[test]
    fn test_fragment_counting() {
        let mut accum = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum.add_fragment(0, 100, 200);
        assert_eq!(accum.total_fragments, 1);
        accum.add_fragment(0, 300, 400);
        assert_eq!(accum.total_fragments, 2);
        accum.add_fragment(0, 300, 400);
        assert_eq!(accum.total_fragments, 3);
        assert_eq!(accum.n_distinct(), 2);
    }
    // A-B-A insertion order still groups the two A copies together.
    #[test]
    fn test_interleaved_fragments_grouped_correctly() {
        let mut accum = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum.add_fragment(0, 100, 200);
        accum.add_fragment(0, 100, 300);
        accum.add_fragment(0, 100, 200);
        assert_eq!(accum.total_fragments, 3);
        assert_eq!(accum.n_distinct(), 2, "2 distinct (A and B)");
        let hist = accum.into_histogram();
        assert_eq!(hist.len(), 2);
        assert!(hist.contains(&(1, 1)), "One singleton (B)");
        assert!(hist.contains(&(2, 1)), "One duplicate pair (A)");
    }
    // merge() sums counts; a fragment present in both halves dedups to one key.
    #[test]
    fn test_accumulator_merge_fragment_counts() {
        let mut accum1 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum1.add_fragment(0, 100, 200);
        accum1.add_fragment(0, 200, 300);
        let mut accum2 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum2.add_fragment(0, 100, 200);
        accum2.add_fragment(0, 300, 400);
        accum1.merge(accum2);
        assert_eq!(accum1.total_fragments, 4);
        assert_eq!(accum1.n_distinct(), 3);
    }
    // finalize() converts each leftover dangling mate into one SE-style fragment.
    #[test]
    fn test_finalize_flushes_dangling_mates() {
        let mut accum = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 100,
                end: 200,
            },
        );
        assert_eq!(accum.total_fragments, 0);
        accum.finalize();
        assert_eq!(accum.total_fragments, 1);
        assert_eq!(accum.n_unpaired, 1);
        assert!(accum.dangling_mates.is_empty());
    }
    // Mates on different tids become two unpaired fragments on merge.
    #[test]
    fn test_merge_resolves_cross_chromosome_mates() {
        let mut accum1 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        let mut accum2 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum1.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 100,
                end: 200,
            },
        );
        accum2.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 1,
                start: 300,
                end: 400,
            },
        );
        accum1.merge(accum2);
        assert_eq!(accum1.total_fragments, 2);
        assert_eq!(accum1.n_unpaired, 2);
        assert!(accum1.dangling_mates.is_empty());
    }
    // Same-tid mates within the length cap merge into one paired fragment.
    #[test]
    fn test_merge_resolves_same_chromosome_mates() {
        let mut accum1 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        let mut accum2 = PreseqAccum::new(DEFAULT_MAX_SEGMENT_LENGTH);
        accum1.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 100,
                end: 200,
            },
        );
        accum2.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 150,
                end: 350,
            },
        );
        accum1.merge(accum2);
        assert_eq!(accum1.total_fragments, 1);
        assert_eq!(accum1.n_paired, 1);
        assert!(accum1.dangling_mates.is_empty());
    }
    // A merged span beyond max_segment_length splits into two fragments.
    #[test]
    fn test_merge_rejects_long_fragments() {
        let mut accum1 = PreseqAccum::new(5000);
        let mut accum2 = PreseqAccum::new(5000);
        accum1.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 0,
                end: 100,
            },
        );
        accum2.dangling_mates.insert(
            1u64,
            MateInfo {
                tid: 0,
                start: 6000,
                end: 6100,
            },
        );
        accum1.merge(accum2);
        assert_eq!(accum1.total_fragments, 2);
        assert_eq!(accum1.n_unpaired, 2);
        assert!(accum1.dangling_mates.is_empty());
    }
    // Interpolating at the full observed depth returns ~n_distinct.
    #[test]
    fn test_interpolation_identity() {
        let hist = vec![(1, 50), (2, 30), (3, 20)];
        let total_reads = 150;
        let n_distinct = 100;
        let result = interpolate(&hist, total_reads, n_distinct, total_reads as f64);
        assert!(
            (result - n_distinct as f64).abs() < 1.0,
            "At N, interpolation should ≈ S, got {}",
            result
        );
    }
    // Interpolated values must never decrease as the target grows.
    #[test]
    fn test_interpolation_monotone() {
        let hist = vec![(1, 100), (2, 50), (3, 25), (4, 10)];
        let total_reads = 335;
        let n_distinct = 185;
        let mut prev = 0.0;
        for target in (10..=total_reads).step_by(10) {
            let val = interpolate(&hist, total_reads, n_distinct, target as f64);
            assert!(
                val >= prev - 1e-6,
                "Interpolation should be monotone: {} < {} at target={}",
                val,
                prev,
                target
            );
            prev = val;
        }
    }
    // Coefficient j is (-1)^j * n_{j+1}.
    #[test]
    fn test_power_series_coeffs() {
        let hist = vec![(1, 100), (2, 50), (3, 25)];
        let coeffs = power_series_coeffs(&hist, 10);
        assert!((coeffs[0] - 100.0).abs() < 1e-10);
        assert!((coeffs[1] - (-50.0)).abs() < 1e-10);
        assert!((coeffs[2] - 25.0).abs() < 1e-10);
    }
    // QD should yield a non-empty CF for a well-behaved geometric-ish series.
    #[test]
    fn test_qd_algorithm_produces_cf() {
        let hist = vec![(1, 1000), (2, 500), (3, 250), (4, 125), (5, 60)];
        let ps = power_series_coeffs(&hist, 10);
        let cf = qd_algorithm(&ps);
        assert!(
            cf.is_some(),
            "QD algorithm should succeed for reasonable input"
        );
        let cf = cf.unwrap();
        assert!(!cf.is_empty(), "CF coefficients should not be empty");
    }
    // One-term CF is the constant; two-term CF is c0 / (1 + c1 * t).
    #[test]
    fn test_cf_evaluation() {
        let cf_coeffs = vec![100.0];
        let result = evaluate_cf(&cf_coeffs, 0.5, 1);
        assert!(
            (result - 100.0).abs() < 1e-6,
            "evaluate_cf with single coeff should give c=100, got {}",
            result
        );
        let cf_coeffs3 = vec![100.0, 0.5];
        let result3 = evaluate_cf(&cf_coeffs3, 1.0, 2);
        assert!(
            (result3 - 100.0 / 1.5).abs() < 1e-6,
            "Two-term CF should give 100/1.5={}, got {}",
            100.0 / 1.5,
            result3
        );
    }
    // Linear-interpolation quantiles on a 5-element slice.
    #[test]
    fn test_quantile_computation() {
        let vals = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        assert!((quantile(&vals, 0.0) - 1.0).abs() < 1e-10);
        assert!((quantile(&vals, 0.5) - 3.0).abs() < 1e-10);
        assert!((quantile(&vals, 1.0) - 5.0).abs() < 1e-10);
        assert!((quantile(&vals, 0.25) - 2.0).abs() < 1e-10);
    }
    // Identical seeds must reproduce identical resamples.
    #[test]
    fn test_bootstrap_reproducibility() {
        let hist = vec![(1, 100), (2, 50), (3, 25)];
        let _total_reads = 225;
        let mut rng1 = CppMt19937::new(42);
        let (h1, t1, d1) = bootstrap_resample(&hist, &mut rng1);
        let mut rng2 = CppMt19937::new(42);
        let (h2, t2, d2) = bootstrap_resample(&hist, &mut rng2);
        assert_eq!(t1, t2, "Same seed should give same total");
        assert_eq!(d1, d2, "Same seed should give same distinct");
        assert_eq!(h1, h2, "Same seed should give same histogram");
    }
    // Point-estimate curve (no bootstrap) must be monotonically increasing.
    #[test]
    fn test_curve_monotone_increasing() {
        let hist: Vec<(u64, u64)> = vec![
            (1, 5000),
            (2, 2500),
            (3, 1250),
            (4, 625),
            (5, 300),
            (6, 150),
            (7, 75),
            (8, 35),
            (9, 18),
            (10, 9),
        ];
        let total_reads: u64 = hist.iter().map(|&(j, n_j)| j * n_j).sum();
        let n_distinct: u64 = hist.iter().map(|&(_, n_j)| n_j).sum();
        let config = PreseqConfig {
            enabled: true,
            max_extrap: total_reads as f64 * 10.0,
            step_size: total_reads as f64 / 10.0,
            n_bootstraps: 0,
            confidence_level: 0.95,
            seed: 1,
            max_terms: 100,
            max_segment_length: 100_000_000,
            defects: false,
        };
        let result = estimate_complexity(&hist, total_reads, n_distinct, &config);
        assert!(result.is_ok(), "Estimation should succeed: {:?}", result);
        let result = result.unwrap();
        let mut prev = 0.0;
        for &(_, expected, _, _) in &result.curve {
            assert!(
                expected >= prev - 1e-6,
                "Curve should be monotonically increasing: {} < {}",
                expected,
                prev
            );
            prev = expected;
        }
    }
    // Step through the pipeline's intermediates on a real benchmark histogram
    // and compare the final bootstrap estimates against upstream preseq.
    #[test]
    fn test_benchmark_histogram_intermediates() {
        let hist: Vec<(u64, u64)> = vec![
            (1, 23425),
            (2, 443),
            (3, 69),
            (4, 28),
            (5, 14),
            (6, 5),
            (8, 3),
            (9, 2),
            (10, 1),
            (18, 1),
        ];
        let total_reads: u64 = 24800;
        let n_distinct: u64 = 23991;
        let hist_total: u64 = hist.iter().map(|&(j, n)| j * n).sum();
        let hist_distinct: u64 = hist.iter().map(|&(_, n)| n).sum();
        assert_eq!(hist_total, total_reads);
        assert_eq!(hist_distinct, n_distinct);
        let first_zero = {
            let max_j = hist.iter().map(|&(j, _)| j).max().unwrap() as usize;
            let mut nj = vec![0u64; max_j + 2];
            for &(j, n) in &hist {
                nj[j as usize] = n;
            }
            (1..=max_j).find(|&j| nj[j] == 0).unwrap_or(max_j + 1)
        };
        assert_eq!(first_zero, 7);
        let max_terms = {
            let mt = std::cmp::min(100usize, first_zero - 1);
            mt - (mt % 2)
        };
        assert_eq!(max_terms, 6);
        let ps_coeffs = power_series_coeffs(&hist, max_terms);
        assert_eq!(ps_coeffs, vec![23425.0, -443.0, 69.0, -28.0, 14.0, -5.0]);
        let cf_coeffs = qd_algorithm(&ps_coeffs).expect("QD should succeed");
        assert_eq!(cf_coeffs.len(), 6);
        let degree = select_degree(&cf_coeffs, total_reads, max_terms, 1e10);
        assert!(
            degree.is_none(),
            "Degree selection should fail for this histogram (matches upstream)"
        );
        let interp_result = interpolate(&hist, total_reads, n_distinct, 12000.0);
        assert!(
            interp_result > 11700.0 && interp_result < 11800.0,
            "Interpolation at 12000 should be ~11770, got {}",
            interp_result
        );
        let config = PreseqConfig {
            enabled: true,
            max_extrap: 1e10,
            step_size: 1e6,
            n_bootstraps: 100,
            confidence_level: 0.95,
            seed: 1,
            max_terms: 100,
            max_segment_length: 100_000_000,
            defects: false,
        };
        let result = estimate_complexity(&hist, total_reads, n_distinct, &config);
        assert!(result.is_ok(), "estimate_complexity failed: {:?}", result);
        let result = result.unwrap();
        assert_eq!(result.curve.len(), 9999, "Expected 9999 curve points");
        let (_, expected_1m, lower_1m, upper_1m) = result.curve[0];
        assert!(
            expected_1m > 500_000.0 && expected_1m < 800_000.0,
            "Expected ~600k at 1M, got {:.1}",
            expected_1m
        );
        assert!(lower_1m < expected_1m, "Lower CI should be below estimate");
        assert!(upper_1m > expected_1m, "Upper CI should be above estimate");
        eprintln!(
            "Benchmark comparison at 1M: expected={:.1} lower={:.1} upper={:.1} (upstream: 613221.1 / 499291.6 / 751901.7)",
            expected_1m, lower_1m, upper_1m
        );
    }
    // End-to-end run with bootstrapping; CI bounds must be ordered when present.
    #[test]
    fn test_full_pipeline_with_bootstrap() {
        let hist: Vec<(u64, u64)> = vec![(1, 5000), (2, 2500), (3, 1250), (4, 625), (5, 300)];
        let total_reads: u64 = hist.iter().map(|&(j, n_j)| j * n_j).sum();
        let n_distinct: u64 = hist.iter().map(|&(_, n_j)| n_j).sum();
        let config = PreseqConfig {
            enabled: true,
            max_extrap: total_reads as f64 * 5.0,
            step_size: total_reads as f64,
            n_bootstraps: 10,
            confidence_level: 0.95,
            seed: 1,
            max_terms: 50,
            max_segment_length: 100_000_000,
            defects: false,
        };
        let result = estimate_complexity(&hist, total_reads, n_distinct, &config);
        assert!(result.is_ok(), "Full pipeline should succeed: {:?}", result);
        let result = result.unwrap();
        assert!(!result.curve.is_empty(), "Curve should not be empty");
        for &(_, _expected, lower, upper) in &result.curve {
            if !lower.is_nan() && !upper.is_nan() {
                assert!(
                    lower <= upper + 1e-6,
                    "Lower CI ({}) should be ≤ upper CI ({})",
                    lower,
                    upper
                );
            }
        }
    }
}